From b1d2870e56631c6583ca27f2a3edcd3d980922ef Mon Sep 17 00:00:00 2001
From: Rainer Kottenhoff <rainer.kottenhoff@gmail.com>
Date: Thu, 10 Jan 2019 22:14:48 +0100
Subject: [PATCH] + upd: merge current CED dev

---
 ced/ced/compact_enc_det/compact_enc_det.cc    | 61 ++++++-------------
 .../compact_enc_det_hint_code.cc              | 12 +---
 2 files changed, 20 insertions(+), 53 deletions(-)

diff --git a/ced/ced/compact_enc_det/compact_enc_det.cc b/ced/ced/compact_enc_det/compact_enc_det.cc
index 02cd69454..1859d42a8 100644
--- a/ced/ced/compact_enc_det/compact_enc_det.cc
+++ b/ced/ced/compact_enc_det/compact_enc_det.cc
@@ -1,4 +1,4 @@
-﻿// Copyright 2016 Google Inc.
+// Copyright 2016 Google Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -88,19 +88,19 @@ using std::string;
 
 // Norbert Runge has noted these words in CP1252 that are mistakenly identified
 // as UTF-8 because of the last pair of characters:
-//  NESTLÃ‰Â®               0xC9 0xAE U+00C9 U+00AE   C9AE = U+026E;SMALL LEZH
-//  drauÃŸ\u2019           0xDF 0x92 U+00DF U+2019   DF92 = U+07D2;NKO LETTER N
-//  MutterschoÃŸ\u201c     0xDF 0x93 U+00DF U+201C   DF93 = U+07D3;NKO LETTER BA
-//  SchoÃŸ\u201c           0xDF 0x93 U+00DF U+201C
-//  weiÃŸ\u201c            0xDF 0x93 U+00DF U+00AB
-//  SchnellfuÃŸ\u201c      0xDF 0x93 U+00DF U+201C
-//  sÃ¼ÃŸÂ«                  0xDF 0xAB U+00DF U+00AB   DFAB = U+07EB;NKO HIGH TONE
+//  NESTLÉ®               0xC9 0xAE U+00C9 U+00AE   C9AE = U+026E;SMALL LEZH
+//  drauß\u2019           0xDF 0x92 U+00DF U+2019   DF92 = U+07D2;NKO LETTER N
+//  Mutterschoß\u201c     0xDF 0x93 U+00DF U+201C   DF93 = U+07D3;NKO LETTER BA
+//  Schoß\u201c           0xDF 0x93 U+00DF U+201C
+//  weiß\u201c            0xDF 0x93 U+00DF U+201C
+//  Schnellfuß\u201c      0xDF 0x93 U+00DF U+201C
+//  süß«                  0xDF 0xAB U+00DF U+00AB   DFAB = U+07EB;NKO HIGH TONE
 // These four byte combinations now explicitly boost Latin1/CP1252.
 
 // And for reference, here are a couple of Portuguese spellings
 // that may be mistaken as double-byte encodings.
-//   informaÃ§Ãµes          0xE7 0xF5
-//   traiÃ§Ã£o              0xE7 0xE3
+//   informações          0xE7 0xF5
+//   traição              0xE7 0xE3
 
 
 static const char* kVersion = "2.2";
@@ -198,9 +198,6 @@ static const int kGentlePairWhack = 2 * XLOG2;       // bits of whack
 static const int kGentlePairBoost = 2 * XLOG2;       // bits of boost
                                             // for well-formed sequence
 
-static const int kBoostPerB64Byte = 2 * XLOG2;       // bits of boost for
-                                            // one good pair in Hz, etc.
-
 static const int kDeclaredEncBoost = 5 * XDECILOG2;  // bits/10 of boost for
                                             // best declared encoding per bigram
 
@@ -1882,10 +1879,9 @@ int ApplyUILanguageHint(const Language language_hint,
 }
 
 // Apply initial probability hint based on corpus type (web, email, etc)
-// Weight is 0..100 percent IGNORED
 // Return 1 if name match found
 int ApplyDefaultHint(const CompactEncDet::TextCorpusType corpus_type,
-                      int weight, DetectEncodingState* destatep) {
+                      DetectEncodingState* destatep) {
 
   for (int i = 0; i < NUM_RANKEDENCODING; i++) {
     // Set the default probability
@@ -1896,7 +1892,6 @@ int ApplyDefaultHint(const CompactEncDet::TextCorpusType corpus_type,
     if (SevenBitEncoding(kMapToEncoding[i])) {
       destatep->enc_prob[i] = 0;
     }
-    (void)weight;
   }
 
   //  A little corpus distinction
@@ -2103,7 +2098,7 @@ void ApplyHints(const char* url_hint,
   if (hint_count == 0) {
     destatep->looking_for_latin_trigrams = true;    // Default needs trigrams
     destatep->declared_enc_2 = destatep->declared_enc_1;
-    hint_count += ApplyDefaultHint(corpus_type, 100, destatep);
+    hint_count += ApplyDefaultHint(corpus_type, destatep);
   }
 
 
@@ -2563,7 +2558,7 @@ void UTF7BoostWhack(DetectEncodingState* destatep, int next_pair, uint8 byte2) {
 }
 
 // Boost, whack, or leave alone HZ probablilty
-void HzBoostWhack(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) {
+void HzBoostWhack(DetectEncodingState* destatep, uint8 byte2) {
   if ((byte2 == '{') || (byte2 == '}')) {
     Boost(destatep, F_HZ_GB_2312, kBoostOnePair);         // Found ~{ or ~}
   } else if ((byte2 == '~') || (byte2 == '\n')) {
@@ -2571,7 +2566,6 @@ void HzBoostWhack(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) {
   } else {
     Whack(destatep, F_HZ_GB_2312, kBadPairWhack);         // Illegal pair
   }
-  (void)byte1;
 }
 
 // Boost, whack, or leave alone BINARY probablilty
@@ -2806,14 +2800,6 @@ int CheckUTF8UTF8Seq(DetectEncodingState* destatep, int weightshift) {
 }
 
 
-// boost, whack, or leave alone UTF-32 probablilty
-// Expecting 0000PPxx 0000QQxx where PP mostly = QQ (UTF-32BE)
-// Expecting xxPP0000 xxQQ0000 where PP mostly = QQ (UTF-32LE)
-void CheckUTF32ActiveSeq(DetectEncodingState* destatep) {
-  (void)destatep;// Not needed
-  //return;
-}
-
 // We give a gentle boost for each paired SO ... SI, whack others
 void CheckIso2022ActiveSeq(DetectEncodingState* destatep) {
   int this_pair = destatep->prior_interesting_pair[OtherPair];
@@ -3059,7 +3045,7 @@ void ActiveSpecialBoostWhack(const uint8* src, DetectEncodingState* destatep) {
         }
       } else if (byte1 == '~') {
         // Boost, whack, or leave alone HZ probablilty
-        HzBoostWhack(destatep, byte1, byte2);
+        HzBoostWhack(destatep, byte2);
         if (destatep->debug_data != nullptr) {
           // Show Hz entry
           char buff[16];
@@ -3132,18 +3118,12 @@ void ActiveSpecialBoostWhack(const uint8* src, DetectEncodingState* destatep) {
     }         // End for i
 
     // Adjust per entire-pair-span
-    int utf8_boost = 0;
-    int utf8utf8_boost = 0;
     if (UTF8Active(destatep)) {
-      utf8_boost = CheckUTF8Seq(destatep, biggest_weightshift);
+      CheckUTF8Seq(destatep, biggest_weightshift);
     }
 
     if (UTF8UTF8Active(destatep)) {
-      utf8utf8_boost = CheckUTF8UTF8Seq(destatep, biggest_weightshift);
-    }
-
-    if (UTF1632Active(destatep)) {
-      CheckUTF32ActiveSeq(destatep);
+      CheckUTF8UTF8Seq(destatep, biggest_weightshift);
     }
 
     if (Iso2022Active(destatep)) {
@@ -3434,7 +3414,6 @@ static const char kMapToFiveBits[256] = {
 #undef HU
 #undef Hc
 
-static const int kTriNoneLikely = 0;
 static const int kTriLatin1Likely = 1;
 static const int kTriLatin2Likely = 2;
 static const int kTriLatin7Likely = 3;
@@ -4558,7 +4537,6 @@ bool QuickPrintableAsciiScan(const char* text, int text_length) {
 }
 
 static const int kMaxScanBack = 192;
-static const int kMaxScanForward = 64;
 
 // Return true if text is inside a tag or JS comment
 bool TextInsideTag(const uint8* isrc, const uint8* src, const uint8* srclimit) {
@@ -4609,7 +4587,7 @@ bool TextInsideTag(const uint8* isrc, const uint8* src, const uint8* srclimit) {
   return false;
 }
 
-const uint8* SkipToTagEnd(const uint8* isrc, const uint8* src, const uint8* srclimit) {
+const uint8* SkipToTagEnd(const uint8* src, const uint8* srclimit) {
   const uint8* ss = src + 1;
   while (ss <= srclimit) {
     uint8 c = *ss++;
@@ -4617,7 +4595,6 @@ const uint8* SkipToTagEnd(const uint8* isrc, const uint8* src, const uint8* srcl
       return ss;
     }
   }
-  (void)isrc;
   return src + 2;     // Always make progress, Otherwise we get an infinite loop
 }
 
@@ -5258,7 +5235,7 @@ Encoding InternalDetectEncoding(
         if (TextInsideTag(isrc, src, srclimitslow2)) {
           if (tag_text_bigram_count >= kMaxBigramsTagTitleText) {
             ignored_some_tag_text = true;
-            src = SkipToTagEnd(destate.last_pair + 2, src, srclimitslow2);
+            src = SkipToTagEnd(src, srclimitslow2);
             continue;
           } else {
             weightshift = kWeightshiftForTagTitleText;
@@ -5277,7 +5254,7 @@ Encoding InternalDetectEncoding(
         src += exit_reason;               // 1 Ascii, 2 other
       } else {
         src += exit_reason;               // 1 Ascii, 2 other
-        //// src = SkipToTagEnd(destate.last_pair, src, srclimitslow2);
+        //// src = SkipToTagEnd(src, srclimitslow2);
       }
 
       if (pruned) {
diff --git a/ced/ced/compact_enc_det/compact_enc_det_hint_code.cc b/ced/ced/compact_enc_det/compact_enc_det_hint_code.cc
index 6d25d64df..227b90cf7 100644
--- a/ced/ced/compact_enc_det/compact_enc_det_hint_code.cc
+++ b/ced/ced/compact_enc_det/compact_enc_det_hint_code.cc
@@ -102,16 +102,6 @@ const char* MyEncodingName(Encoding enc) {
 }
 
 
-// http://www.iana.org/assignments/character-sets says charset name is up to
-// 40 bytes of any printable ASCII, but that can't be right
-// when parsing HTML; at least quote is not allowed. The list
-// here includes all punctuation in all registered names as of April 2006
-static const char* kWordLetters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-                                  "abcdefghijklmnopqrstuvwxyz"
-                                  "0123456789"
-                                  "-_.:()";
-
-
 // Normalize ASCII string to first 4 alphabetic chars and last 4 digit chars
 // Letters are forced to lowercase ASCII
 // Used to normalize charset= values
@@ -128,7 +118,7 @@ string MakeChar44(const string& str) {
       }
     } else if (kIsDigit[uc]) {
       if (d_ptr < 4) {
-        res[d_ptr + 4] = kCharsetToLowerTbl[uc];
+        res[4 + d_ptr] = kCharsetToLowerTbl[uc];
       } else {
         // Keep last 4 digits by shifting left
         res[4] = res[5];