+ upd: merge current CED dev

2026-06-14 21:09:05 +08:00 · 2019-01-10 22:14:48 +01:00 · 2019-01-10 22:14:48 +01:00 · b1d2870e56
commit b1d2870e56
parent 6b334ba85e
2 changed files with 20 additions and 53 deletions
--- a/ced/ced/compact_enc_det/compact_enc_det.cc
+++ b/ced/ced/compact_enc_det/compact_enc_det.cc
@ -1,4 +1,4 @@
-// Copyright 2016 Google Inc.
+// Copyright 2016 Google Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@ -88,19 +88,19 @@ using std::string;

 // Norbert Runge has noted these words in CP1252 that are mistakenly identified
 // as UTF-8 because of the last pair of characters:
-//  NESTLÃ‰Â®               0xC9 0xAE U+00C9 U+00AE   C9AE = U+026E;SMALL LEZH
-//  drauÃŸ\u2019           0xDF 0x92 U+00DF U+2019   DF92 = U+07D2;NKO LETTER N
-//  MutterschoÃŸ\u201c     0xDF 0x93 U+00DF U+201C   DF93 = U+07D3;NKO LETTER BA
-//  SchoÃŸ\u201c           0xDF 0x93 U+00DF U+201C
-//  weiÃŸ\u201c            0xDF 0x93 U+00DF U+00AB
-//  SchnellfuÃŸ\u201c      0xDF 0x93 U+00DF U+201C
-//  sÃ¼ÃŸÂ«                  0xDF 0xAB U+00DF U+00AB   DFAB = U+07EB;NKO HIGH TONE
+//  NESTLÉ®               0xC9 0xAE U+00C9 U+00AE   C9AE = U+026E;SMALL LEZH
+//  drauß\u2019           0xDF 0x92 U+00DF U+2019   DF92 = U+07D2;NKO LETTER N
+//  Mutterschoß\u201c     0xDF 0x93 U+00DF U+201C   DF93 = U+07D3;NKO LETTER BA
+//  Schoß\u201c           0xDF 0x93 U+00DF U+201C
+//  weiß\u201c            0xDF 0x93 U+00DF U+201C
+//  Schnellfuß\u201c      0xDF 0x93 U+00DF U+201C
+//  süß«                  0xDF 0xAB U+00DF U+00AB   DFAB = U+07EB;NKO HIGH TONE
 // These four byte combinations now explicitly boost Latin1/CP1252.

 // And for reference, here are a couple of Portuguese spellings
 // that may be mistaken as double-byte encodings.
-//   informaÃ§Ãµes          0xE7 0xF5
-//   traiÃ§Ã£o              0xE7 0xE3
+//   informações          0xE7 0xF5
+//   traição              0xE7 0xE3


 static const char* kVersion = "2.2";
@ -198,9 +198,6 @@ static const int kGentlePairWhack = 2 * XLOG2;       // bits of whack
 static const int kGentlePairBoost = 2 * XLOG2;       // bits of boost
                                            // for well-formed sequence

-static const int kBoostPerB64Byte = 2 * XLOG2;       // bits of boost for
-                                            // one good pair in Hz, etc.
-
 static const int kDeclaredEncBoost = 5 * XDECILOG2;  // bits/10 of boost for
                                            // best declared encoding per bigram

@ -1882,10 +1879,9 @@ int ApplyUILanguageHint(const Language language_hint,
 }

 // Apply initial probability hint based on corpus type (web, email, etc)
-// Weight is 0..100 percent IGNORED
 // Return 1 if name match found
 int ApplyDefaultHint(const CompactEncDet::TextCorpusType corpus_type,
-                      int weight, DetectEncodingState* destatep) {
+                      DetectEncodingState* destatep) {

  for (int i = 0; i < NUM_RANKEDENCODING; i++) {
    // Set the default probability
@ -1896,7 +1892,6 @@ int ApplyDefaultHint(const CompactEncDet::TextCorpusType corpus_type,
    if (SevenBitEncoding(kMapToEncoding[i])) {
      destatep->enc_prob[i] = 0;
    }
-    (void)weight;
  }

  //  A little corpus distinction
@ -2103,7 +2098,7 @@ void ApplyHints(const char* url_hint,
  if (hint_count == 0) {
    destatep->looking_for_latin_trigrams = true;    // Default needs trigrams
    destatep->declared_enc_2 = destatep->declared_enc_1;
-    hint_count += ApplyDefaultHint(corpus_type, 100, destatep);
+    hint_count += ApplyDefaultHint(corpus_type, destatep);
  }


@ -2563,7 +2558,7 @@ void UTF7BoostWhack(DetectEncodingState* destatep, int next_pair, uint8 byte2) {
 }

 // Boost, whack, or leave alone HZ probability
-void HzBoostWhack(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) {
+void HzBoostWhack(DetectEncodingState* destatep, uint8 byte2) {
  if ((byte2 == '{') || (byte2 == '}')) {
    Boost(destatep, F_HZ_GB_2312, kBoostOnePair);         // Found ~{ or ~}
  } else if ((byte2 == '~') || (byte2 == '\n')) {
@ -2571,7 +2566,6 @@ void HzBoostWhack(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) {
  } else {
    Whack(destatep, F_HZ_GB_2312, kBadPairWhack);         // Illegal pair
  }
-  (void)byte1;
 }

 // Boost, whack, or leave alone BINARY probability
@ -2806,14 +2800,6 @@ int CheckUTF8UTF8Seq(DetectEncodingState* destatep, int weightshift) {
 }


-// boost, whack, or leave alone UTF-32 probablilty
-// Expecting 0000PPxx 0000QQxx where PP mostly = QQ (UTF-32BE)
-// Expecting xxPP0000 xxQQ0000 where PP mostly = QQ (UTF-32LE)
-void CheckUTF32ActiveSeq(DetectEncodingState* destatep) {
-  (void)destatep;// Not needed
-  //return;
-}
-
 // We give a gentle boost for each paired SO ... SI, whack others
 void CheckIso2022ActiveSeq(DetectEncodingState* destatep) {
  int this_pair = destatep->prior_interesting_pair[OtherPair];
@ -3059,7 +3045,7 @@ void ActiveSpecialBoostWhack(const uint8* src, DetectEncodingState* destatep) {
        }
      } else if (byte1 == '~') {
        // Boost, whack, or leave alone HZ probablilty
-        HzBoostWhack(destatep, byte1, byte2);
+        HzBoostWhack(destatep, byte2);
        if (destatep->debug_data != nullptr) {
          // Show Hz entry
          char buff[16];
@ -3132,18 +3118,12 @@ void ActiveSpecialBoostWhack(const uint8* src, DetectEncodingState* destatep) {
    }         // End for i

    // Adjust per entire-pair-span
-    int utf8_boost = 0;
-    int utf8utf8_boost = 0;
    if (UTF8Active(destatep)) {
-      utf8_boost = CheckUTF8Seq(destatep, biggest_weightshift);
+      CheckUTF8Seq(destatep, biggest_weightshift);
    }

    if (UTF8UTF8Active(destatep)) {
-      utf8utf8_boost = CheckUTF8UTF8Seq(destatep, biggest_weightshift);
-    }
-
-    if (UTF1632Active(destatep)) {
-      CheckUTF32ActiveSeq(destatep);
+      CheckUTF8UTF8Seq(destatep, biggest_weightshift);
    }

    if (Iso2022Active(destatep)) {
@ -3434,7 +3414,6 @@ static const char kMapToFiveBits[256] = {
 #undef HU
 #undef Hc

-static const int kTriNoneLikely = 0;
 static const int kTriLatin1Likely = 1;
 static const int kTriLatin2Likely = 2;
 static const int kTriLatin7Likely = 3;
@ -4558,7 +4537,6 @@ bool QuickPrintableAsciiScan(const char* text, int text_length) {
 }

 static const int kMaxScanBack = 192;
-static const int kMaxScanForward = 64;

 // Return true if text is inside a tag or JS comment
 bool TextInsideTag(const uint8* isrc, const uint8* src, const uint8* srclimit) {
@ -4609,7 +4587,7 @@ bool TextInsideTag(const uint8* isrc, const uint8* src, const uint8* srclimit) {
  return false;
 }

-const uint8* SkipToTagEnd(const uint8* isrc, const uint8* src, const uint8* srclimit) {
+const uint8* SkipToTagEnd(const uint8* src, const uint8* srclimit) {
  const uint8* ss = src + 1;
  while (ss <= srclimit) {
    uint8 c = *ss++;
@ -4617,7 +4595,6 @@ const uint8* SkipToTagEnd(const uint8* isrc, const uint8* src, const uint8* srcl
      return ss;
    }
  }
-  (void)isrc;
  return src + 2;     // Always make progress, Otherwise we get an infinite loop
 }

@ -5258,7 +5235,7 @@ Encoding InternalDetectEncoding(
        if (TextInsideTag(isrc, src, srclimitslow2)) {
          if (tag_text_bigram_count >= kMaxBigramsTagTitleText) {
            ignored_some_tag_text = true;
-            src = SkipToTagEnd(destate.last_pair + 2, src, srclimitslow2);
+            src = SkipToTagEnd(src, srclimitslow2);
            continue;
          } else {
            weightshift = kWeightshiftForTagTitleText;
@ -5277,7 +5254,7 @@ Encoding InternalDetectEncoding(
        src += exit_reason;               // 1 Ascii, 2 other
      } else {
        src += exit_reason;               // 1 Ascii, 2 other
-        //// src = SkipToTagEnd(destate.last_pair, src, srclimitslow2);
+        //// src = SkipToTagEnd(src, srclimitslow2);
      }

      if (pruned) {
--- a/ced/ced/compact_enc_det/compact_enc_det_hint_code.cc
+++ b/ced/ced/compact_enc_det/compact_enc_det_hint_code.cc
@ -102,16 +102,6 @@ const char* MyEncodingName(Encoding enc) {
 }


-// http://www.iana.org/assignments/character-sets says charset name is up to
-// 40 bytes of any printable ASCII, but that can't be right
-// when parsing HTML; at least quote is not allowed. The list
-// here includes all punctuation in all registered names as of April 2006
-static const char* kWordLetters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-                                  "abcdefghijklmnopqrstuvwxyz"
-                                  "0123456789"
-                                  "-_.:()";
-
-
 // Normalize ASCII string to first 4 alphabetic chars and last 4 digit chars
 // Letters are forced to lowercase ASCII
 // Used to normalize charset= values
@ -128,7 +118,7 @@ string MakeChar44(const string& str) {
      }
    } else if (kIsDigit[uc]) {
      if (d_ptr < 4) {
-        res[d_ptr + 4] = kCharsetToLowerTbl[uc];
+        res[4 + d_ptr] = kCharsetToLowerTbl[uc];
      } else {
        // Keep last 4 digits by shifting left
        res[4] = res[5];