From 3cefe783cb200b4d4f28bdee0fcdc3f8fc3906d9 Mon Sep 17 00:00:00 2001 From: RaiKoHoff Date: Tue, 28 Jan 2020 18:42:01 +0100 Subject: [PATCH] + chg: finetuning of Single Byte Char Set (SBCS) detection --- src/EncodingDetection.cpp | 4 +- uchardet/uchardet/src/nsSBCSGroupProber.cpp | 131 +++++++++--------- uchardet/uchardet/src/nsSBCharSetProber.cpp | 15 +- uchardet/uchardet/src/nsSBCharSetProber.h | 3 +- uchardet/uchardet/src/nsUniversalDetector.cpp | 17 +-- uchardet/uchardet/src/nsUniversalDetector.h | 1 - 6 files changed, 83 insertions(+), 88 deletions(-) diff --git a/src/EncodingDetection.cpp b/src/EncodingDetection.cpp index a80870a7e..7bde97701 100644 --- a/src/EncodingDetection.cpp +++ b/src/EncodingDetection.cpp @@ -1279,7 +1279,7 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(LPWSTR pszFile, const char* lpData, if (encDetRes.analyzedEncoding == CPI_NONE) { encDetRes.analyzedEncoding = iAnalyzeFallback; - confidence = Settings2.AnalyzeReliableConfidenceLevel / 4.0f; + confidence = (1.0f - Settings2.AnalyzeReliableConfidenceLevel); } if (!bSkipUTFDetection) @@ -1322,7 +1322,7 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(LPWSTR pszFile, const char* lpData, if (encDetRes.analyzedEncoding == CPI_NONE) { encDetRes.analyzedEncoding = iAnalyzeFallback; - confidence = Settings2.AnalyzeReliableConfidenceLevel / 4.0f; + confidence = (1.0f - Settings2.AnalyzeReliableConfidenceLevel); } else if (encDetRes.analyzedEncoding == CPI_ASCII_7BIT) { encDetRes.analyzedEncoding = asciiEnc; diff --git a/uchardet/uchardet/src/nsSBCSGroupProber.cpp b/uchardet/uchardet/src/nsSBCSGroupProber.cpp index 746aa5200..e3e42df8d 100644 --- a/uchardet/uchardet/src/nsSBCSGroupProber.cpp +++ b/uchardet/uchardet/src/nsSBCSGroupProber.cpp @@ -51,72 +51,41 @@ nsSBCSGroupProber::nsSBCSGroupProber() : mNumOfProbers(MAX_NUM_OF_SBCS_PROBERS), mBestGuess(-1), mActiveNum(0) { PRUint32 i = 0; - mProbers[i++] = new nsSingleByteCharSetProber(&Win1251RussianModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Koi8rRussianModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Latin5RussianModel); - mProbers[i++] = new nsSingleByteCharSetProber(&MacCyrillicRussianModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Ibm866RussianModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Ibm855RussianModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_7GreekModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1253GreekModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252GermanModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1GermanModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Latin5BulgarianModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Win1251BulgarianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252FrenchModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1FrenchModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15FrenchModel); - nsHebrewProber *hebprober = new nsHebrewProber(); - // Notice: Any change in these indexes - 10,11,12 must be reflected - // in the code below as well. - PRUint32 const heb = i; - mProbers[i++] = hebprober; - mProbers[i++] = new nsSingleByteCharSetProber(&Win1255Model, PR_FALSE, hebprober); // Logical Hebrew - mProbers[i++] = new nsSingleByteCharSetProber(&Win1255Model, PR_TRUE, hebprober); // Visual Hebrew - // Tell the Hebrew prober about the logical and visual probers - if (mProbers[heb] && mProbers[heb+1] && mProbers[heb+2]) // all are not null - { - hebprober->SetModelProbers(mProbers[heb+1], mProbers[heb+2]); - } - else // One or more is null. avoid any Hebrew probing, null them all - { - for (PRUint32 j = heb + 2; j >= heb; --j) - { - delete mProbers[j]; - mProbers[j] = nsnull; - } - } + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252SpanishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1SpanishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15SpanishModel); + + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252PortugueseModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1PortugueseModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9PortugueseModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15PortugueseModel); + + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1250HungarianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_2HungarianModel); + + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_3EsperantoModel); mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252AfricaansModel); mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1AfricaansModel); mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9AfricaansModel); mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15AfricaansModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1GermanModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252GermanModel); - - mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1FrenchModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15FrenchModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252FrenchModel); - - mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1SpanishModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15SpanishModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252SpanishModel); - - mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1PortugueseModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9PortugueseModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15PortugueseModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252PortugueseModel); - - mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_2HungarianModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1250HungarianModel); - - mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_3EsperantoModel); - - mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_3TurkishModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9TurkishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252NederlandsModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1NederlandsModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9NederlandsModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15NederlandsModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252DanishModel); mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15DanishModel); mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1DanishModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252DanishModel); mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_13LithuanianModel); mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_10LithuanianModel); @@ -145,18 +114,18 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[i++] = new nsSingleByteCharSetProber(&MaccentraleuropePolishModel); mProbers[i++] = new nsSingleByteCharSetProber(&Ibm852PolishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252FinnishModel); mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1FinnishModel); mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_4FinnishModel); mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9FinnishModel); mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_13FinnishModel); mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15FinnishModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252FinnishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252ItalianModel); mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1ItalianModel); mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_3ItalianModel); mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9ItalianModel); mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15ItalianModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252ItalianModel); mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1250CroatianModel); mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_2CroatianModel); @@ -171,10 +140,10 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_13EstonianModel); mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15EstonianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252IrishModel); mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1IrishModel); mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9IrishModel); mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15IrishModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252IrishModel); mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1250RomanianModel); mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_2RomanianModel); @@ -187,25 +156,55 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[i++] = new nsSingleByteCharSetProber(&MaccentraleuropeSloveneModel); mProbers[i++] = new nsSingleByteCharSetProber(&Ibm852SloveneModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252SwedishModel); mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1SwedishModel); mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_4SwedishModel); mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9SwedishModel); mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15SwedishModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252SwedishModel); mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1251BelarusianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Win1251RussianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Koi8rRussianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Latin5RussianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&MacCyrillicRussianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Ibm866RussianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Ibm855RussianModel); + + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_7GreekModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1253GreekModel); + + mProbers[i++] = new nsSingleByteCharSetProber(&Latin5BulgarianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Win1251BulgarianModel); + + nsHebrewProber* hebprober = new nsHebrewProber(); + // Notice: Any change in these indexes - 10,11,12 must be reflected + // in the code below as well. + PRUint32 const heb = i; + mProbers[i++] = hebprober; + mProbers[i++] = new nsSingleByteCharSetProber(&Win1255Model, PR_FALSE, hebprober); // Logical Hebrew + mProbers[i++] = new nsSingleByteCharSetProber(&Win1255Model, PR_TRUE, hebprober); // Visual Hebrew + // Tell the Hebrew prober about the logical and visual probers + if (mProbers[heb] && mProbers[heb + 1] && mProbers[heb + 2]) // all are not null + { + hebprober->SetModelProbers(mProbers[heb + 1], mProbers[heb + 2]); + } + else // One or more is null. avoid any Hebrew probing, null them all + { + for (PRUint32 j = heb + 2; j >= heb; --j) + { + delete mProbers[j]; + mProbers[j] = nsnull; + } + } + + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_3TurkishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9TurkishModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252NederlandsModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1NederlandsModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9NederlandsModel); - mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15NederlandsModel); - - mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_6ArabicModel); mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1256ArabicModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_6ArabicModel); - mProbers[i++] = new nsSingleByteCharSetProber(&VisciiVietnameseModel); mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1258VietnameseModel); - + mProbers[i++] = new nsSingleByteCharSetProber(&VisciiVietnameseModel); mProbers[i++] = new nsSingleByteCharSetProber(&Tis_620ThaiModel); mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_11ThaiModel); diff --git a/uchardet/uchardet/src/nsSBCharSetProber.cpp b/uchardet/uchardet/src/nsSBCharSetProber.cpp index 4b1a6a72f..4ac053476 100644 --- a/uchardet/uchardet/src/nsSBCharSetProber.cpp +++ b/uchardet/uchardet/src/nsSBCharSetProber.cpp @@ -83,7 +83,7 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32 if (mTotalSeqs > SB_ENOUGH_REL_THRESHOLD) { float const cf = GetConfidence(); - if (cf > POSITIVE_SHORTCUT_THRESHOLD) + if (cf >= POSITIVE_SHORTCUT_THRESHOLD) mState = eFoundIt; else if (cf < NEGATIVE_SHORTCUT_THRESHOLD) mState = eNotMe; @@ -116,14 +116,13 @@ float nsSingleByteCharSetProber::GetConfidence() if ((mTotalChar > 0) && (mTotalSeqs > 0)) { // weighted good sequence count - PRUint32 const validSeqs = mSeqCounters[POSITIVE_CAT] - + (mSeqCounters[PROBABLE_CAT] >> 1) - + (mSeqCounters[NEUTRAL_CAT] >> 2); + PRUint32 const probableSeqs = mSeqCounters[POSITIVE_CAT] + (mSeqCounters[PROBABLE_CAT] >> 2); + PRUint32 const validSeqs = mTotalSeqs - mSeqCounters[NEGATIVE_CAT]; - float r = mModel->mTypicalPositiveRatio; + float r = rfactor(mSeqCounters[POSITIVE_CAT], mTotalSeqs) / mModel->mTypicalPositiveRatio; // negative sequence correction factor - r *= rfactor(validSeqs, (mTotalSeqs + ((validSeqs >> 2) * mSeqCounters[NEGATIVE_CAT]))); + r *= rfactor(validSeqs, (mTotalSeqs + (netChars * mSeqCounters[NEGATIVE_CAT]))); /* Multiply by a ratio of positive sequences per characters. * This would help in particular to distinguish close winners. @@ -133,9 +132,9 @@ float nsSingleByteCharSetProber::GetConfidence() * character). This could make the difference between very closely related * charsets used for the same language. */ - r *= rfactor(validSeqs, netChars); + r *= rfactor(validSeqs, netChars); - /* The more control characters (proportionnaly to the size of the text), the + /* The more control characters (proportionally to the size of the text), the * less confident we become in the current charset. */ r *= rfactor(netChars, mTotalChar); diff --git a/uchardet/uchardet/src/nsSBCharSetProber.h b/uchardet/uchardet/src/nsSBCharSetProber.h index bb15037fb..7c5f282d8 100644 --- a/uchardet/uchardet/src/nsSBCharSetProber.h +++ b/uchardet/uchardet/src/nsSBCharSetProber.h @@ -55,7 +55,7 @@ /* Numbers 0-9. */ #define NUM 251 -#define SB_ENOUGH_REL_THRESHOLD min(512, ENOUGH_DATA_THRESHOLD) +#define SB_ENOUGH_REL_THRESHOLD (ENOUGH_DATA_THRESHOLD >> 1) #define POSITIVE_SHORTCUT_THRESHOLD SHORTCUT_THRESHOLD #define NEGATIVE_SHORTCUT_THRESHOLD (0.05f) #define SYMBOL_CAT_ORDER 250 @@ -83,6 +83,7 @@ class nsSingleByteCharSetProber : public nsCharSetProber{ public: nsSingleByteCharSetProber(const SequenceModel *model) :mModel(model), mReversed(PR_FALSE), mNameProber(0) { Reset(); } + nsSingleByteCharSetProber(const SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber) :mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); } diff --git a/uchardet/uchardet/src/nsUniversalDetector.cpp b/uchardet/uchardet/src/nsUniversalDetector.cpp index e97190fe0..7c556c098 100644 --- a/uchardet/uchardet/src/nsUniversalDetector.cpp +++ b/uchardet/uchardet/src/nsUniversalDetector.cpp @@ -55,7 +55,6 @@ nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter) { mNbspFound = PR_FALSE; mDone = PR_FALSE; - mBestGuess = -1; //illegal value as signal mInTag = PR_FALSE; mEscCharSetProber = nsnull; @@ -86,7 +85,6 @@ nsUniversalDetector::Reset() { mNbspFound = PR_FALSE; mDone = PR_FALSE; - mBestGuess = -1; //illegal value as signal mInTag = PR_FALSE; mStart = PR_TRUE; @@ -365,15 +363,14 @@ void nsUniversalDetector::DataEnd() { case eHighbyte: { - float proberConfidence; - float maxProberConfidence = (float)0.0; - PRInt32 maxProber = 0; + float maxProberConfidence = 0.0f; + PRInt32 maxProber = -1; - for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++) + for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; ++i) { if (mCharSetProbers[i]) { - proberConfidence = mCharSetProbers[i]->GetConfidence(); + float const proberConfidence = mCharSetProbers[i]->GetConfidence(); if (proberConfidence > maxProberConfidence) { maxProberConfidence = proberConfidence; @@ -382,10 +379,10 @@ void nsUniversalDetector::DataEnd() } } mDetectedConfidence = maxProberConfidence; + //do not report anything because we are not confident of it, that's in fact a negative answer - if (maxProberConfidence > MINIMUM_THRESHOLD) { - Report(mCharSetProbers[maxProber]->GetCharSetName(), mCharSetProbers[maxProber]->GetConfidence()); - mDetectedConfidence = mCharSetProbers[maxProber]->GetConfidence(); + if ((maxProber >= 0) && (maxProberConfidence > MINIMUM_THRESHOLD)) { + Report(mCharSetProbers[maxProber]->GetCharSetName(), maxProberConfidence); } } break; diff --git a/uchardet/uchardet/src/nsUniversalDetector.h b/uchardet/uchardet/src/nsUniversalDetector.h index 0ddaf155b..45d93dab7 100644 --- a/uchardet/uchardet/src/nsUniversalDetector.h +++ b/uchardet/uchardet/src/nsUniversalDetector.h @@ -86,7 +86,6 @@ protected: const char * mDetectedCharset; float mDetectedConfidence; short mDetectedIsBOM; - PRInt32 mBestGuess; PRUint32 mLanguageFilter; nsCharSetProber *mCharSetProbers[NUM_OF_CHARSET_PROBERS];