// ~lzh/A133.git

/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 
#include "annotator/number/number.h"
 
#include <string>
#include <vector>
 
#include "annotator/collections.h"
#include "annotator/model_generated.h"
#include "annotator/types-test-util.h"
#include "annotator/types.h"
#include "utils/test-utils.h"
#include "utils/utf8/unicodetext.h"
#include "utils/utf8/unilib.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
 
namespace libtextclassifier3 {
namespace {
 
using testing::AllOf;
using testing::ElementsAre;
using testing::Field;
 
const NumberAnnotatorOptions* TestingNumberAnnotatorOptions() {
  static const flatbuffers::DetachedBuffer* options_data = []() {
    NumberAnnotatorOptionsT options;
    options.enabled = true;
    options.allowed_prefix_codepoints.push_back('$');
    options.allowed_suffix_codepoints.push_back('%');
 
    flatbuffers::FlatBufferBuilder builder;
    builder.Finish(NumberAnnotatorOptions::Pack(builder, &options));
    return new flatbuffers::DetachedBuffer(builder.Release());
  }();
 
  return flatbuffers::GetRoot<NumberAnnotatorOptions>(options_data->data());
}
 
FeatureProcessor BuildFeatureProcessor(const UniLib* unilib) {
  static const flatbuffers::DetachedBuffer* options_data = []() {
    FeatureProcessorOptionsT options;
    options.context_size = 1;
    options.max_selection_span = 1;
    options.snap_label_span_boundaries_to_containing_tokens = false;
    options.ignored_span_boundary_codepoints.push_back(',');
 
    options.tokenization_codepoint_config.emplace_back(
        new TokenizationCodepointRangeT());
    auto& config = options.tokenization_codepoint_config.back();
    config->start = 32;
    config->end = 33;
    config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
 
    flatbuffers::FlatBufferBuilder builder;
    builder.Finish(FeatureProcessorOptions::Pack(builder, &options));
    return new flatbuffers::DetachedBuffer(builder.Release());
  }();
 
  const FeatureProcessorOptions* feature_processor_options =
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_data->data());
 
  return FeatureProcessor(feature_processor_options, unilib);
}
 
// Test fixture owning the NumberAnnotator under test together with the UniLib
// and FeatureProcessor instances it is constructed from.
class NumberAnnotatorTest : public ::testing::Test {
 protected:
  // Wires the members together: the feature processor receives a pointer to
  // unilib_, and the annotator receives the testing options plus a pointer to
  // feature_processor_.
  NumberAnnotatorTest()
      : INIT_UNILIB_FOR_TESTING(unilib_),
        feature_processor_(BuildFeatureProcessor(&unilib_)),
        number_annotator_(TestingNumberAnnotatorOptions(),
                          &feature_processor_) {}
 
  // NOTE: the declaration order below is the initialization order; each member
  // is handed a pointer to the one declared before it, so keep this order.
  UniLib unilib_;
  FeatureProcessor feature_processor_;
  NumberAnnotator number_annotator_;
};
 
// A selection consisting only of digits is classified as "number" and its
// numeric value is parsed.
TEST_F(NumberAnnotatorTest, ClassifiesAndParsesNumberCorrectly) {
  const UnicodeText context = UTF8ToUnicodeText("... 12345 ...");
  const CodepointSpan selection(4, 9);

  ClassificationResult parsed;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));

  EXPECT_EQ(parsed.collection, "number");
  EXPECT_EQ(parsed.numeric_value, 12345);
}
 
// A selection with a letter in the middle of the digits is rejected.
TEST_F(NumberAnnotatorTest, ClassifiesNonNumberCorrectly) {
  const UnicodeText context = UTF8ToUnicodeText("... 123a45 ...");
  const CodepointSpan selection(4, 10);

  ClassificationResult unused;
  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW, &unused));
}
 
TEST_F(NumberAnnotatorTest, FindsAllNumbersInText) {
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("... 12345 ... 9 is my number and I paid $99 and "
                        "sometimes 27% but not 68# nor #68"),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));
 
  ASSERT_EQ(result.size(), 4);
  ASSERT_EQ(result[0].classification.size(), 1);
  EXPECT_EQ(result[0].classification[0].collection, "number");
  EXPECT_EQ(result[0].classification[0].numeric_value, 12345);
  ASSERT_EQ(result[1].classification.size(), 1);
  EXPECT_EQ(result[1].classification[0].collection, "number");
  EXPECT_EQ(result[1].classification[0].numeric_value, 9);
  ASSERT_EQ(result[2].classification.size(), 1);
  EXPECT_EQ(result[2].classification[0].collection, "number");
  EXPECT_EQ(result[2].classification[0].numeric_value, 99);
  ASSERT_EQ(result[3].classification.size(), 1);
  EXPECT_EQ(result[3].classification[0].collection, "number");
  EXPECT_EQ(result[3].classification[0].numeric_value, 27);
}
 
TEST_F(NumberAnnotatorTest, FindsNumberWithPunctuation) {
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("Come at 9, ok?"),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));
 
  EXPECT_THAT(
      result,
      ElementsAre(
          AllOf(Field(&AnnotatedSpan::span, CodepointSpan(8, 9)),
                Field(&AnnotatedSpan::classification,
                      ElementsAre(AllOf(
                          Field(&ClassificationResult::collection, "number"),
                          Field(&ClassificationResult::numeric_value, 9)))))));
}
 
TEST_F(NumberAnnotatorTest, HandlesNumbersAtBeginning) {
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("-5"), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &result));
 
  EXPECT_THAT(
      result,
      ElementsAre(
          AllOf(Field(&AnnotatedSpan::span, CodepointSpan(0, 2)),
                Field(&AnnotatedSpan::classification,
                      ElementsAre(AllOf(
                          Field(&ClassificationResult::collection, "number"),
                          Field(&ClassificationResult::numeric_value, -5)))))));
}
 
// An 18-digit negative number is accepted and parsed to its exact value.
TEST_F(NumberAnnotatorTest, WhenLowestSupportedNumberParsesIt) {
  const UnicodeText context = UTF8ToUnicodeText("-999999999999999999");
  const CodepointSpan whole_text(0, 19);

  ClassificationResult parsed;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, whole_text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));

  const auto is_lowest_number =
      AllOf(Field(&ClassificationResult::collection, "number"),
            Field(&ClassificationResult::numeric_value, -999999999999999999L));
  EXPECT_THAT(parsed, is_lowest_number);
}
 
// An 18-digit positive number is accepted and parsed to its exact value.
TEST_F(NumberAnnotatorTest, WhenLargestSupportedNumberParsesIt) {
  const UnicodeText context = UTF8ToUnicodeText("999999999999999999");
  const CodepointSpan whole_text(0, 18);

  ClassificationResult parsed;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, whole_text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));

  const auto is_largest_number =
      AllOf(Field(&ClassificationResult::collection, "number"),
            Field(&ClassificationResult::numeric_value, 999999999999999999L));
  EXPECT_THAT(parsed, is_largest_number);
}
 
// A negative number with 20 digits is rejected.
TEST_F(NumberAnnotatorTest, WhenFirstLowestNonSupportedNumberDoesNotParseIt) {
  const UnicodeText context = UTF8ToUnicodeText("-10000000000000000000");
  const CodepointSpan whole_text(0, 21);

  ClassificationResult unused;
  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, whole_text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &unused));
}
 
// A positive number with 20 digits is rejected.
TEST_F(NumberAnnotatorTest, WhenFirstLargestNonSupportedNumberDoesNotParseIt) {
  const UnicodeText context = UTF8ToUnicodeText("10000000000000000000");
  const CodepointSpan whole_text(0, 20);

  ClassificationResult unused;
  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, whole_text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &unused));
}
 
// A 40-digit number is rejected.
TEST_F(NumberAnnotatorTest, WhenLargeNumberDoesNotParseIt) {
  const UnicodeText context =
      UTF8ToUnicodeText("1234567890123456789012345678901234567890");
  const CodepointSpan whole_text(0, 40);

  ClassificationResult unused;
  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, whole_text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &unused));
}
 
// Two leading minus signs make the selection a non-number.
TEST_F(NumberAnnotatorTest, WhenMultipleMinusSignsDoesNotParseIt) {
  const UnicodeText context = UTF8ToUnicodeText("--10");
  const CodepointSpan whole_text(0, 4);

  ClassificationResult unused;
  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, whole_text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &unused));
}
 
// A trailing minus sign makes the selection a non-number.
TEST_F(NumberAnnotatorTest, WhenMinusSignSuffixDoesNotParseIt) {
  const UnicodeText context = UTF8ToUnicodeText("10-");
  const CodepointSpan whole_text(0, 3);

  ClassificationResult unused;
  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, whole_text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &unused));
}
 
// A minus sign between two digit runs makes the selection a non-number.
TEST_F(NumberAnnotatorTest, WhenMinusInTheMiddleDoesNotParseIt) {
  const UnicodeText context = UTF8ToUnicodeText("2016-2017");
  const CodepointSpan whole_text(0, 9);

  ClassificationResult unused;
  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, whole_text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &unused));
}
 
// An allowed suffix character standing alone yields no annotations.
TEST_F(NumberAnnotatorTest, WhenSuffixWithoutNumberDoesNotParseIt) {
  const UnicodeText text = UTF8ToUnicodeText("... % ...");

  std::vector<AnnotatedSpan> found;
  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &found));

  ASSERT_EQ(found.size(), 0);
}
 
// An allowed prefix character standing alone yields no annotations.
TEST_F(NumberAnnotatorTest, WhenPrefixWithoutNumberDoesNotParseIt) {
  const UnicodeText text = UTF8ToUnicodeText("... $ ...");

  std::vector<AnnotatedSpan> found;
  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &found));

  ASSERT_EQ(found.size(), 0);
}
 
// A prefix character immediately followed by a suffix character, with no
// digits in between, yields no annotations.
TEST_F(NumberAnnotatorTest, WhenPrefixAndSuffixWithoutNumberDoesNotParseIt) {
  const UnicodeText text = UTF8ToUnicodeText("... $% ...");

  std::vector<AnnotatedSpan> found;
  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &found));

  ASSERT_EQ(found.size(), 0);
}
 
}  // namespace
}  // namespace libtextclassifier3