mirror of
https://github.com/rizonesoft/Notepad3.git
synced 2026-06-14 21:09:05 +08:00
+ upd: merge current CED dev
This commit is contained in:
parent
6b334ba85e
commit
b1d2870e56
@ -1,4 +1,4 @@
|
||||
// Copyright 2016 Google Inc.
|
||||
// Copyright 2016 Google Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
@ -88,19 +88,19 @@ using std::string;
|
||||
|
||||
// Norbert Runge has noted these words in CP1252 that are mistakenly identified
|
||||
// as UTF-8 because of the last pair of characters:
|
||||
// NESTLÉ® 0xC9 0xAE U+00C9 U+00AE C9AE = U+026E;SMALL LEZH
|
||||
// drauß\u2019 0xDF 0x92 U+00DF U+2019 DF92 = U+07D2;NKO LETTER N
|
||||
// Mutterschoß\u201c 0xDF 0x93 U+00DF U+201C DF93 = U+07D3;NKO LETTER BA
|
||||
// Schoß\u201c 0xDF 0x93 U+00DF U+201C
|
||||
// weiß\u201c 0xDF 0x93 U+00DF U+201C
|
||||
// Schnellfuß\u201c 0xDF 0x93 U+00DF U+201C
|
||||
// süß« 0xDF 0xAB U+00DF U+00AB DFAB = U+07EB;NKO HIGH TONE
|
||||
// NESTLÉ® 0xC9 0xAE U+00C9 U+00AE C9AE = U+026E;SMALL LEZH
|
||||
// drauß\u2019 0xDF 0x92 U+00DF U+2019 DF92 = U+07D2;NKO LETTER N
|
||||
// Mutterschoß\u201c 0xDF 0x93 U+00DF U+201C DF93 = U+07D3;NKO LETTER BA
|
||||
// Schoß\u201c 0xDF 0x93 U+00DF U+201C
|
||||
// weiß\u201c 0xDF 0x93 U+00DF U+201C
|
||||
// Schnellfuß\u201c 0xDF 0x93 U+00DF U+201C
|
||||
// süß« 0xDF 0xAB U+00DF U+00AB DFAB = U+07EB;NKO HIGH TONE
|
||||
// These four byte combinations now explicitly boost Latin1/CP1252.
|
||||
|
||||
// And for reference, here are a couple of Portuguese spellings
|
||||
// that may be mistaken as double-byte encodings.
|
||||
// informações 0xE7 0xF5
|
||||
// traição 0xE7 0xE3
|
||||
// informações 0xE7 0xF5
|
||||
// traição 0xE7 0xE3
|
||||
|
||||
|
||||
static const char* kVersion = "2.2";
|
||||
@ -198,9 +198,6 @@ static const int kGentlePairWhack = 2 * XLOG2; // bits of whack
|
||||
static const int kGentlePairBoost = 2 * XLOG2; // bits of boost
|
||||
// for well-formed sequence
|
||||
|
||||
static const int kBoostPerB64Byte = 2 * XLOG2; // bits of boost for
|
||||
// one good pair in Hz, etc.
|
||||
|
||||
static const int kDeclaredEncBoost = 5 * XDECILOG2; // bits/10 of boost for
|
||||
// best declared encoding per bigram
|
||||
|
||||
@ -1882,10 +1879,9 @@ int ApplyUILanguageHint(const Language language_hint,
|
||||
}
|
||||
|
||||
// Apply initial probability hint based on corpus type (web, email, etc)
|
||||
// Weight is 0..100 percent IGNORED
|
||||
// Return 1 if name match found
|
||||
int ApplyDefaultHint(const CompactEncDet::TextCorpusType corpus_type,
|
||||
int weight, DetectEncodingState* destatep) {
|
||||
DetectEncodingState* destatep) {
|
||||
|
||||
for (int i = 0; i < NUM_RANKEDENCODING; i++) {
|
||||
// Set the default probability
|
||||
@ -1896,7 +1892,6 @@ int ApplyDefaultHint(const CompactEncDet::TextCorpusType corpus_type,
|
||||
if (SevenBitEncoding(kMapToEncoding[i])) {
|
||||
destatep->enc_prob[i] = 0;
|
||||
}
|
||||
(void)weight;
|
||||
}
|
||||
|
||||
// A little corpus distinction
|
||||
@ -2103,7 +2098,7 @@ void ApplyHints(const char* url_hint,
|
||||
if (hint_count == 0) {
|
||||
destatep->looking_for_latin_trigrams = true; // Default needs trigrams
|
||||
destatep->declared_enc_2 = destatep->declared_enc_1;
|
||||
hint_count += ApplyDefaultHint(corpus_type, 100, destatep);
|
||||
hint_count += ApplyDefaultHint(corpus_type, destatep);
|
||||
}
|
||||
|
||||
|
||||
@ -2563,7 +2558,7 @@ void UTF7BoostWhack(DetectEncodingState* destatep, int next_pair, uint8 byte2) {
|
||||
}
|
||||
|
||||
// Boost, whack, or leave alone HZ probability
|
||||
void HzBoostWhack(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) {
|
||||
void HzBoostWhack(DetectEncodingState* destatep, uint8 byte2) {
|
||||
if ((byte2 == '{') || (byte2 == '}')) {
|
||||
Boost(destatep, F_HZ_GB_2312, kBoostOnePair); // Found ~{ or ~}
|
||||
} else if ((byte2 == '~') || (byte2 == '\n')) {
|
||||
@ -2571,7 +2566,6 @@ void HzBoostWhack(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) {
|
||||
} else {
|
||||
Whack(destatep, F_HZ_GB_2312, kBadPairWhack); // Illegal pair
|
||||
}
|
||||
(void)byte1;
|
||||
}
|
||||
|
||||
// Boost, whack, or leave alone BINARY probability
|
||||
@ -2806,14 +2800,6 @@ int CheckUTF8UTF8Seq(DetectEncodingState* destatep, int weightshift) {
|
||||
}
|
||||
|
||||
|
||||
// boost, whack, or leave alone UTF-32 probability
|
||||
// Expecting 0000PPxx 0000QQxx where PP mostly = QQ (UTF-32BE)
|
||||
// Expecting xxPP0000 xxQQ0000 where PP mostly = QQ (UTF-32LE)
|
||||
void CheckUTF32ActiveSeq(DetectEncodingState* destatep) {
|
||||
(void)destatep;// Not needed
|
||||
//return;
|
||||
}
|
||||
|
||||
// We give a gentle boost for each paired SO ... SI, whack others
|
||||
void CheckIso2022ActiveSeq(DetectEncodingState* destatep) {
|
||||
int this_pair = destatep->prior_interesting_pair[OtherPair];
|
||||
@ -3059,7 +3045,7 @@ void ActiveSpecialBoostWhack(const uint8* src, DetectEncodingState* destatep) {
|
||||
}
|
||||
} else if (byte1 == '~') {
|
||||
// Boost, whack, or leave alone HZ probability
|
||||
HzBoostWhack(destatep, byte1, byte2);
|
||||
HzBoostWhack(destatep, byte2);
|
||||
if (destatep->debug_data != nullptr) {
|
||||
// Show Hz entry
|
||||
char buff[16];
|
||||
@ -3132,18 +3118,12 @@ void ActiveSpecialBoostWhack(const uint8* src, DetectEncodingState* destatep) {
|
||||
} // End for i
|
||||
|
||||
// Adjust per entire-pair-span
|
||||
int utf8_boost = 0;
|
||||
int utf8utf8_boost = 0;
|
||||
if (UTF8Active(destatep)) {
|
||||
utf8_boost = CheckUTF8Seq(destatep, biggest_weightshift);
|
||||
CheckUTF8Seq(destatep, biggest_weightshift);
|
||||
}
|
||||
|
||||
if (UTF8UTF8Active(destatep)) {
|
||||
utf8utf8_boost = CheckUTF8UTF8Seq(destatep, biggest_weightshift);
|
||||
}
|
||||
|
||||
if (UTF1632Active(destatep)) {
|
||||
CheckUTF32ActiveSeq(destatep);
|
||||
CheckUTF8UTF8Seq(destatep, biggest_weightshift);
|
||||
}
|
||||
|
||||
if (Iso2022Active(destatep)) {
|
||||
@ -3434,7 +3414,6 @@ static const char kMapToFiveBits[256] = {
|
||||
#undef HU
|
||||
#undef Hc
|
||||
|
||||
static const int kTriNoneLikely = 0;
|
||||
static const int kTriLatin1Likely = 1;
|
||||
static const int kTriLatin2Likely = 2;
|
||||
static const int kTriLatin7Likely = 3;
|
||||
@ -4558,7 +4537,6 @@ bool QuickPrintableAsciiScan(const char* text, int text_length) {
|
||||
}
|
||||
|
||||
static const int kMaxScanBack = 192;
|
||||
static const int kMaxScanForward = 64;
|
||||
|
||||
// Return true if text is inside a tag or JS comment
|
||||
bool TextInsideTag(const uint8* isrc, const uint8* src, const uint8* srclimit) {
|
||||
@ -4609,7 +4587,7 @@ bool TextInsideTag(const uint8* isrc, const uint8* src, const uint8* srclimit) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const uint8* SkipToTagEnd(const uint8* isrc, const uint8* src, const uint8* srclimit) {
|
||||
const uint8* SkipToTagEnd(const uint8* src, const uint8* srclimit) {
|
||||
const uint8* ss = src + 1;
|
||||
while (ss <= srclimit) {
|
||||
uint8 c = *ss++;
|
||||
@ -4617,7 +4595,6 @@ const uint8* SkipToTagEnd(const uint8* isrc, const uint8* src, const uint8* srcl
|
||||
return ss;
|
||||
}
|
||||
}
|
||||
(void)isrc;
|
||||
return src + 2; // Always make progress, Otherwise we get an infinite loop
|
||||
}
|
||||
|
||||
@ -5258,7 +5235,7 @@ Encoding InternalDetectEncoding(
|
||||
if (TextInsideTag(isrc, src, srclimitslow2)) {
|
||||
if (tag_text_bigram_count >= kMaxBigramsTagTitleText) {
|
||||
ignored_some_tag_text = true;
|
||||
src = SkipToTagEnd(destate.last_pair + 2, src, srclimitslow2);
|
||||
src = SkipToTagEnd(src, srclimitslow2);
|
||||
continue;
|
||||
} else {
|
||||
weightshift = kWeightshiftForTagTitleText;
|
||||
@ -5277,7 +5254,7 @@ Encoding InternalDetectEncoding(
|
||||
src += exit_reason; // 1 Ascii, 2 other
|
||||
} else {
|
||||
src += exit_reason; // 1 Ascii, 2 other
|
||||
//// src = SkipToTagEnd(destate.last_pair, src, srclimitslow2);
|
||||
//// src = SkipToTagEnd(src, srclimitslow2);
|
||||
}
|
||||
|
||||
if (pruned) {
|
||||
|
||||
@ -102,16 +102,6 @@ const char* MyEncodingName(Encoding enc) {
|
||||
}
|
||||
|
||||
|
||||
// http://www.iana.org/assignments/character-sets says charset name is up to
|
||||
// 40 bytes of any printable ASCII, but that can't be right
|
||||
// when parsing HTML; at least quote is not allowed. The list
|
||||
// here includes all punctuation in all registered names as of April 2006
|
||||
static const char* kWordLetters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
||||
"abcdefghijklmnopqrstuvwxyz"
|
||||
"0123456789"
|
||||
"-_.:()";
|
||||
|
||||
|
||||
// Normalize ASCII string to first 4 alphabetic chars and last 4 digit chars
|
||||
// Letters are forced to lowercase ASCII
|
||||
// Used to normalize charset= values
|
||||
@ -128,7 +118,7 @@ string MakeChar44(const string& str) {
|
||||
}
|
||||
} else if (kIsDigit[uc]) {
|
||||
if (d_ptr < 4) {
|
||||
res[d_ptr + 4] = kCharsetToLowerTbl[uc];
|
||||
res[4 + d_ptr] = kCharsetToLowerTbl[uc];
|
||||
} else {
|
||||
// Keep last 4 digits by shifting left
|
||||
res[4] = res[5];
|
||||
|
||||
Loading…
Reference in New Issue
Block a user