+ upd: merge current CED dev

This commit is contained in:
Rainer Kottenhoff 2019-01-10 22:14:48 +01:00
parent 6b334ba85e
commit b1d2870e56
2 changed files with 20 additions and 53 deletions

View File

@ -1,4 +1,4 @@
// Copyright 2016 Google Inc.
// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -88,19 +88,19 @@ using std::string;
// Norbert Runge has noted these words in CP1252 that are mistakenly identified
// as UTF-8 because of the last pair of characters:
// NESTLÉ® 0xC9 0xAE U+00C9 U+00AE C9AE = U+026E;SMALL LEZH
// drauß\u2019 0xDF 0x92 U+00DF U+2019 DF92 = U+07D2;NKO LETTER N
// Mutterschoß\u201c 0xDF 0x93 U+00DF U+201C DF93 = U+07D3;NKO LETTER BA
// Schoß\u201c 0xDF 0x93 U+00DF U+201C
// weiß\u201c 0xDF 0x93 U+00DF U+201C
// Schnellfuß\u201c 0xDF 0x93 U+00DF U+201C
// süß« 0xDF 0xAB U+00DF U+00AB DFAB = U+07EB;NKO HIGH TONE
// NESTLÉ® 0xC9 0xAE U+00C9 U+00AE C9AE = U+026E;SMALL LEZH
// drauß\u2019 0xDF 0x92 U+00DF U+2019 DF92 = U+07D2;NKO LETTER N
// Mutterschoß\u201c 0xDF 0x93 U+00DF U+201C DF93 = U+07D3;NKO LETTER BA
// Schoß\u201c 0xDF 0x93 U+00DF U+201C
// weiß\u201c 0xDF 0x93 U+00DF U+201C
// Schnellfuß\u201c 0xDF 0x93 U+00DF U+201C
// süß« 0xDF 0xAB U+00DF U+00AB DFAB = U+07EB;NKO HIGH TONE
// These four byte combinations now explicitly boost Latin1/CP1252.
// And for reference, here are a couple of Portuguese spellings
// that may be mistaken as double-byte encodings.
// informações 0xE7 0xF5
// traição 0xE7 0xE3
// informações 0xE7 0xF5
// traição 0xE7 0xE3
static const char* kVersion = "2.2";
@ -198,9 +198,6 @@ static const int kGentlePairWhack = 2 * XLOG2; // bits of whack
static const int kGentlePairBoost = 2 * XLOG2; // bits of boost
// for well-formed sequence
static const int kBoostPerB64Byte = 2 * XLOG2; // bits of boost for
// one good pair in Hz, etc.
static const int kDeclaredEncBoost = 5 * XDECILOG2; // bits/10 of boost for
// best declared encoding per bigram
@ -1882,10 +1879,9 @@ int ApplyUILanguageHint(const Language language_hint,
}
// Apply initial probability hint based on corpus type (web, email, etc)
// Weight is 0..100 percent IGNORED
// Return 1 if name match found
int ApplyDefaultHint(const CompactEncDet::TextCorpusType corpus_type,
int weight, DetectEncodingState* destatep) {
DetectEncodingState* destatep) {
for (int i = 0; i < NUM_RANKEDENCODING; i++) {
// Set the default probability
@ -1896,7 +1892,6 @@ int ApplyDefaultHint(const CompactEncDet::TextCorpusType corpus_type,
if (SevenBitEncoding(kMapToEncoding[i])) {
destatep->enc_prob[i] = 0;
}
(void)weight;
}
// A little corpus distinction
@ -2103,7 +2098,7 @@ void ApplyHints(const char* url_hint,
if (hint_count == 0) {
destatep->looking_for_latin_trigrams = true; // Default needs trigrams
destatep->declared_enc_2 = destatep->declared_enc_1;
hint_count += ApplyDefaultHint(corpus_type, 100, destatep);
hint_count += ApplyDefaultHint(corpus_type, destatep);
}
@ -2563,7 +2558,7 @@ void UTF7BoostWhack(DetectEncodingState* destatep, int next_pair, uint8 byte2) {
}
// Boost, whack, or leave alone HZ probability
void HzBoostWhack(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) {
void HzBoostWhack(DetectEncodingState* destatep, uint8 byte2) {
if ((byte2 == '{') || (byte2 == '}')) {
Boost(destatep, F_HZ_GB_2312, kBoostOnePair); // Found ~{ or ~}
} else if ((byte2 == '~') || (byte2 == '\n')) {
@ -2571,7 +2566,6 @@ void HzBoostWhack(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) {
} else {
Whack(destatep, F_HZ_GB_2312, kBadPairWhack); // Illegal pair
}
(void)byte1;
}
// Boost, whack, or leave alone BINARY probability
@ -2806,14 +2800,6 @@ int CheckUTF8UTF8Seq(DetectEncodingState* destatep, int weightshift) {
}
// Boost, whack, or leave alone UTF-32 probability
// Expecting 0000PPxx 0000QQxx where PP mostly = QQ (UTF-32BE)
// Expecting xxPP0000 xxQQ0000 where PP mostly = QQ (UTF-32LE)
// NOTE: the body below is a stub -- it performs none of the checks described
// above and neither reads nor modifies *destatep.
void CheckUTF32ActiveSeq(DetectEncodingState* destatep) {
(void)destatep;// Not needed; the cast marks the parameter as deliberately unused
//return;
}
// We give a gentle boost for each paired SO ... SI, whack others
void CheckIso2022ActiveSeq(DetectEncodingState* destatep) {
int this_pair = destatep->prior_interesting_pair[OtherPair];
@ -3059,7 +3045,7 @@ void ActiveSpecialBoostWhack(const uint8* src, DetectEncodingState* destatep) {
}
} else if (byte1 == '~') {
// Boost, whack, or leave alone HZ probability
HzBoostWhack(destatep, byte1, byte2);
HzBoostWhack(destatep, byte2);
if (destatep->debug_data != nullptr) {
// Show Hz entry
char buff[16];
@ -3132,18 +3118,12 @@ void ActiveSpecialBoostWhack(const uint8* src, DetectEncodingState* destatep) {
} // End for i
// Adjust per entire-pair-span
int utf8_boost = 0;
int utf8utf8_boost = 0;
if (UTF8Active(destatep)) {
utf8_boost = CheckUTF8Seq(destatep, biggest_weightshift);
CheckUTF8Seq(destatep, biggest_weightshift);
}
if (UTF8UTF8Active(destatep)) {
utf8utf8_boost = CheckUTF8UTF8Seq(destatep, biggest_weightshift);
}
if (UTF1632Active(destatep)) {
CheckUTF32ActiveSeq(destatep);
CheckUTF8UTF8Seq(destatep, biggest_weightshift);
}
if (Iso2022Active(destatep)) {
@ -3434,7 +3414,6 @@ static const char kMapToFiveBits[256] = {
#undef HU
#undef Hc
static const int kTriNoneLikely = 0;
static const int kTriLatin1Likely = 1;
static const int kTriLatin2Likely = 2;
static const int kTriLatin7Likely = 3;
@ -4558,7 +4537,6 @@ bool QuickPrintableAsciiScan(const char* text, int text_length) {
}
static const int kMaxScanBack = 192;
static const int kMaxScanForward = 64;
// Return true if text is inside a tag or JS comment
bool TextInsideTag(const uint8* isrc, const uint8* src, const uint8* srclimit) {
@ -4609,7 +4587,7 @@ bool TextInsideTag(const uint8* isrc, const uint8* src, const uint8* srclimit) {
return false;
}
const uint8* SkipToTagEnd(const uint8* isrc, const uint8* src, const uint8* srclimit) {
const uint8* SkipToTagEnd(const uint8* src, const uint8* srclimit) {
const uint8* ss = src + 1;
while (ss <= srclimit) {
uint8 c = *ss++;
@ -4617,7 +4595,6 @@ const uint8* SkipToTagEnd(const uint8* isrc, const uint8* src, const uint8* srcl
return ss;
}
}
(void)isrc;
return src + 2; // Always make progress, Otherwise we get an infinite loop
}
@ -5258,7 +5235,7 @@ Encoding InternalDetectEncoding(
if (TextInsideTag(isrc, src, srclimitslow2)) {
if (tag_text_bigram_count >= kMaxBigramsTagTitleText) {
ignored_some_tag_text = true;
src = SkipToTagEnd(destate.last_pair + 2, src, srclimitslow2);
src = SkipToTagEnd(src, srclimitslow2);
continue;
} else {
weightshift = kWeightshiftForTagTitleText;
@ -5277,7 +5254,7 @@ Encoding InternalDetectEncoding(
src += exit_reason; // 1 Ascii, 2 other
} else {
src += exit_reason; // 1 Ascii, 2 other
//// src = SkipToTagEnd(destate.last_pair, src, srclimitslow2);
//// src = SkipToTagEnd(src, srclimitslow2);
}
if (pruned) {

View File

@ -102,16 +102,6 @@ const char* MyEncodingName(Encoding enc) {
}
// http://www.iana.org/assignments/character-sets says charset name is up to
// 40 bytes of any printable ASCII, but that can't be right
// when parsing HTML; at least quote is not allowed. The list
// here includes all punctuation in all registered names as of April 2006
static const char* kWordLetters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789"
"-_.:()";
// Normalize ASCII string to first 4 alphabetic chars and last 4 digit chars
// Letters are forced to lowercase ASCII
// Used to normalize charset= values
@ -128,7 +118,7 @@ string MakeChar44(const string& str) {
}
} else if (kIsDigit[uc]) {
if (d_ptr < 4) {
res[d_ptr + 4] = kCharsetToLowerTbl[uc];
res[4 + d_ptr] = kCharsetToLowerTbl[uc];
} else {
// Keep last 4 digits by shifting left
res[4] = res[5];