From b1d2870e56631c6583ca27f2a3edcd3d980922ef Mon Sep 17 00:00:00 2001 From: Rainer Kottenhoff Date: Thu, 10 Jan 2019 22:14:48 +0100 Subject: [PATCH] + upd: merge current CED dev --- ced/ced/compact_enc_det/compact_enc_det.cc | 61 ++++++------------- .../compact_enc_det_hint_code.cc | 12 +--- 2 files changed, 20 insertions(+), 53 deletions(-) diff --git a/ced/ced/compact_enc_det/compact_enc_det.cc b/ced/ced/compact_enc_det/compact_enc_det.cc index 02cd69454..1859d42a8 100644 --- a/ced/ced/compact_enc_det/compact_enc_det.cc +++ b/ced/ced/compact_enc_det/compact_enc_det.cc @@ -1,4 +1,4 @@ -// Copyright 2016 Google Inc. +// Copyright 2016 Google Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -88,19 +88,19 @@ using std::string; // Norbert Runge has noted these words in CP1252 that are mistakenly identified // as UTF-8 because of the last pair of characters: -// NESTLÉ® 0xC9 0xAE U+00C9 U+00AE C9AE = U+026E;SMALL LEZH -// drauß\u2019 0xDF 0x92 U+00DF U+2019 DF92 = U+07D2;NKO LETTER N -// Mutterschoß\u201c 0xDF 0x93 U+00DF U+201C DF93 = U+07D3;NKO LETTER BA -// Schoß\u201c 0xDF 0x93 U+00DF U+201C -// weiß\u201c 0xDF 0x93 U+00DF U+00AB -// Schnellfuß\u201c 0xDF 0x93 U+00DF U+201C -// süß« 0xDF 0xAB U+00DF U+00AB DFAB = U+07EB;NKO HIGH TONE +// NESTLÉ® 0xC9 0xAE U+00C9 U+00AE C9AE = U+026E;SMALL LEZH +// drauß\u2019 0xDF 0x92 U+00DF U+2019 DF92 = U+07D2;NKO LETTER N +// Mutterschoß\u201c 0xDF 0x93 U+00DF U+201C DF93 = U+07D3;NKO LETTER BA +// Schoß\u201c 0xDF 0x93 U+00DF U+201C +// weiß\u201c 0xDF 0x93 U+00DF U+00AB +// Schnellfuß\u201c 0xDF 0x93 U+00DF U+201C +// süß« 0xDF 0xAB U+00DF U+00AB DFAB = U+07EB;NKO HIGH TONE // These four byte combinations now explicitly boost Latin1/CP1252. // And for reference, here are a couple of Portuguese spellings // that may be mistaken as double-byte encodings. -// informações 0xE7 0xF5 -// traição 0xE7 0xE3 +// informações 0xE7 0xF5 +// traição 0xE7 0xE3 static const char* kVersion = "2.2"; @@ -198,9 +198,6 @@ static const int kGentlePairWhack = 2 * XLOG2; // bits of whack static const int kGentlePairBoost = 2 * XLOG2; // bits of boost // for well-formed sequence -static const int kBoostPerB64Byte = 2 * XLOG2; // bits of boost for - // one good pair in Hz, etc. - static const int kDeclaredEncBoost = 5 * XDECILOG2; // bits/10 of boost for // best declared encoding per bigram @@ -1882,10 +1879,9 @@ int ApplyUILanguageHint(const Language language_hint, } // Apply initial probability hint based on corpus type (web, email, etc) -// Weight is 0..100 percent IGNORED // Return 1 if name match found int ApplyDefaultHint(const CompactEncDet::TextCorpusType corpus_type, - int weight, DetectEncodingState* destatep) { + DetectEncodingState* destatep) { for (int i = 0; i < NUM_RANKEDENCODING; i++) { // Set the default probability @@ -1896,7 +1892,6 @@ int ApplyDefaultHint(const CompactEncDet::TextCorpusType corpus_type, if (SevenBitEncoding(kMapToEncoding[i])) { destatep->enc_prob[i] = 0; } - (void)weight; } // A little corpus distinction @@ -2103,7 +2098,7 @@ void ApplyHints(const char* url_hint, if (hint_count == 0) { destatep->looking_for_latin_trigrams = true; // Default needs trigrams destatep->declared_enc_2 = destatep->declared_enc_1; - hint_count += ApplyDefaultHint(corpus_type, 100, destatep); + hint_count += ApplyDefaultHint(corpus_type, destatep); } @@ -2563,7 +2558,7 @@ void UTF7BoostWhack(DetectEncodingState* destatep, int next_pair, uint8 byte2) { } // Boost, whack, or leave alone HZ probablilty -void HzBoostWhack(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) { +void HzBoostWhack(DetectEncodingState* destatep, uint8 byte2) { if ((byte2 == '{') || (byte2 == '}')) { Boost(destatep, F_HZ_GB_2312, kBoostOnePair); // Found ~{ or ~} } else if ((byte2 == '~') || (byte2 == '\n')) { @@ -2571,7 +2566,6 @@ void HzBoostWhack(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) { } else { Whack(destatep, F_HZ_GB_2312, kBadPairWhack); // Illegal pair } - (void)byte1; } // Boost, whack, or leave alone BINARY probablilty @@ -2806,14 +2800,6 @@ int CheckUTF8UTF8Seq(DetectEncodingState* destatep, int weightshift) { } -// boost, whack, or leave alone UTF-32 probablilty -// Expecting 0000PPxx 0000QQxx where PP mostly = QQ (UTF-32BE) -// Expecting xxPP0000 xxQQ0000 where PP mostly = QQ (UTF-32LE) -void CheckUTF32ActiveSeq(DetectEncodingState* destatep) { - (void)destatep;// Not needed - //return; -} - // We give a gentle boost for each paired SO ... SI, whack others void CheckIso2022ActiveSeq(DetectEncodingState* destatep) { int this_pair = destatep->prior_interesting_pair[OtherPair]; @@ -3059,7 +3045,7 @@ void ActiveSpecialBoostWhack(const uint8* src, DetectEncodingState* destatep) { } } else if (byte1 == '~') { // Boost, whack, or leave alone HZ probablilty - HzBoostWhack(destatep, byte1, byte2); + HzBoostWhack(destatep, byte2); if (destatep->debug_data != nullptr) { // Show Hz entry char buff[16]; @@ -3132,18 +3118,12 @@ void ActiveSpecialBoostWhack(const uint8* src, DetectEncodingState* destatep) { } // End for i // Adjust per entire-pair-span - int utf8_boost = 0; - int utf8utf8_boost = 0; if (UTF8Active(destatep)) { - utf8_boost = CheckUTF8Seq(destatep, biggest_weightshift); + CheckUTF8Seq(destatep, biggest_weightshift); } if (UTF8UTF8Active(destatep)) { - utf8utf8_boost = CheckUTF8UTF8Seq(destatep, biggest_weightshift); - } - - if (UTF1632Active(destatep)) { - CheckUTF32ActiveSeq(destatep); + CheckUTF8UTF8Seq(destatep, biggest_weightshift); } if (Iso2022Active(destatep)) { @@ -3434,7 +3414,6 @@ static const char kMapToFiveBits[256] = { #undef HU #undef Hc -static const int kTriNoneLikely = 0; static const int kTriLatin1Likely = 1; static const int kTriLatin2Likely = 2; static const int kTriLatin7Likely = 3; @@ -4558,7 +4537,6 @@ bool QuickPrintableAsciiScan(const char* text, int text_length) { } static const int kMaxScanBack = 192; -static const int kMaxScanForward = 64; // Return true if text is inside a tag or JS comment bool TextInsideTag(const uint8* isrc, const uint8* src, const uint8* srclimit) { @@ -4609,7 +4587,7 @@ bool TextInsideTag(const uint8* isrc, const uint8* src, const uint8* srclimit) { return false; } -const uint8* SkipToTagEnd(const uint8* isrc, const uint8* src, const uint8* srclimit) { +const uint8* SkipToTagEnd(const uint8* src, const uint8* srclimit) { const uint8* ss = src + 1; while (ss <= srclimit) { uint8 c = *ss++; @@ -4617,7 +4595,6 @@ const uint8* SkipToTagEnd(const uint8* isrc, const uint8* src, const uint8* srcl return ss; } } - (void)isrc; return src + 2; // Always make progress, Otherwise we get an infinite loop } @@ -5258,7 +5235,7 @@ Encoding InternalDetectEncoding( if (TextInsideTag(isrc, src, srclimitslow2)) { if (tag_text_bigram_count >= kMaxBigramsTagTitleText) { ignored_some_tag_text = true; - src = SkipToTagEnd(destate.last_pair + 2, src, srclimitslow2); + src = SkipToTagEnd(src, srclimitslow2); continue; } else { weightshift = kWeightshiftForTagTitleText; @@ -5277,7 +5254,7 @@ Encoding InternalDetectEncoding( src += exit_reason; // 1 Ascii, 2 other } else { src += exit_reason; // 1 Ascii, 2 other - //// src = SkipToTagEnd(destate.last_pair, src, srclimitslow2); + //// src = SkipToTagEnd(src, srclimitslow2); } if (pruned) { diff --git a/ced/ced/compact_enc_det/compact_enc_det_hint_code.cc b/ced/ced/compact_enc_det/compact_enc_det_hint_code.cc index 6d25d64df..227b90cf7 100644 --- a/ced/ced/compact_enc_det/compact_enc_det_hint_code.cc +++ b/ced/ced/compact_enc_det/compact_enc_det_hint_code.cc @@ -102,16 +102,6 @@ const char* MyEncodingName(Encoding enc) { } -// http://www.iana.org/assignments/character-sets says charset name is up to -// 40 bytes of any printable ASCII, but that can't be right -// when parsing HTML; at least quote is not allowed. The list -// here includes all punctuation in all registered names as of April 2006 -static const char* kWordLetters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz" - "0123456789" - "-_.:()"; - - // Normalize ASCII string to first 4 alphabetic chars and last 4 digit chars // Letters are forced to lowercase ASCII // Used to normalize charset= values @@ -128,7 +118,7 @@ string MakeChar44(const string& str) { } } else if (kIsDigit[uc]) { if (d_ptr < 4) { - res[d_ptr + 4] = kCharsetToLowerTbl[uc]; + res[4 + d_ptr] = kCharsetToLowerTbl[uc]; } else { // Keep last 4 digits by shifting left res[4] = res[5];