From ef3fdd8006b022396decd846a915db510450942a Mon Sep 17 00:00:00 2001 From: RaiKoHoff Date: Thu, 16 Jan 2020 18:17:16 +0100 Subject: [PATCH] + fix/chg: UCHARDET: Confidence calculation for Single Byte Character Set (SBCS) --- Build/Notepad3.ini | 4 +- Versions/build.txt | 2 +- res/Notepad3.exe.manifest.conf | 2 +- src/Config/Config.cpp | 2 +- src/EncodingDetection.cpp | 16 +- src/Notepad3.c | 7 +- src/VersionEx.h | 2 +- uchardet/uchardet/src/CharDistribution.cpp | 12 +- uchardet/uchardet/src/CharDistribution.h | 8 +- uchardet/uchardet/src/nsCharSetProber.h | 1 - uchardet/uchardet/src/nsEscCharsetProber.h | 2 +- uchardet/uchardet/src/nsGB18030Prober.cpp | 6 +- uchardet/uchardet/src/nsHebrewProber.cpp | 2 +- uchardet/uchardet/src/nsLatin1Prober.cpp | 23 +- uchardet/uchardet/src/nsMBCSGroupProber.cpp | 45 ++- uchardet/uchardet/src/nsMBCSGroupProber.h | 11 +- uchardet/uchardet/src/nsSBCSGroupProber.cpp | 266 +++++++++--------- uchardet/uchardet/src/nsSBCSGroupProber.h | 12 +- uchardet/uchardet/src/nsSBCharSetProber.cpp | 39 ++- uchardet/uchardet/src/nsSBCharSetProber.h | 8 +- uchardet/uchardet/src/nsUTF8Prober.cpp | 10 +- uchardet/uchardet/src/nsUniversalDetector.cpp | 4 +- uchardet/uchardet/src/nscore.h | 18 +- 23 files changed, 268 insertions(+), 234 deletions(-) diff --git a/Build/Notepad3.ini b/Build/Notepad3.ini index 9495ec706..a34227077 100644 --- a/Build/Notepad3.ini +++ b/Build/Notepad3.ini @@ -1,4 +1,4 @@ -[Notepad3] +[Notepad3] ;Notepad3.ini=%USERPROFILE%\Notepad3.ini ;Notepad3.ini=%APPDATA%\Rizonesoft\Notepad3\Notepad3.ini [Settings] @@ -49,7 +49,7 @@ SettingsVersion=4 ;UndoTransactionTimeout=0 ;AdministrationTool.exe= ;DevDebugMode=0 -;AnalyzeReliableConfidenceLevel=66 +;AnalyzeReliableConfidenceLevel=70 ;LexerSQLNumberSignAsComment=1 ;ExitOnESCSkipLevel=2 [Statusbar Settings] diff --git a/Versions/build.txt b/Versions/build.txt index e31b25996..a80fc0bca 100644 --- a/Versions/build.txt +++ b/Versions/build.txt @@ -1 +1 @@ -2708 +2709 diff --git a/res/Notepad3.exe.manifest.conf b/res/Notepad3.exe.manifest.conf index fb5912c65..fe37c9301 100644 --- a/res/Notepad3.exe.manifest.conf +++ b/res/Notepad3.exe.manifest.conf @@ -3,7 +3,7 @@ Notepad3 BETA diff --git a/src/Config/Config.cpp b/src/Config/Config.cpp index 2ef2a8bb6..1054d0485 100644 --- a/src/Config/Config.cpp +++ b/src/Config/Config.cpp @@ -780,7 +780,7 @@ void LoadSettings() Settings2.NoCutLineOnEmptySelection = IniSectionGetBool(Settings2_Section, L"NoCutLineOnEmptySelection", Defaults2.NoCutLineOnEmptySelection); - int const iARCLdef = 66; + int const iARCLdef = 70; Defaults2.AnalyzeReliableConfidenceLevel = (float)iARCLdef / 100.0f; int const iARCLset = clampi(IniSectionGetInt(Settings2_Section, L"AnalyzeReliableConfidenceLevel", iARCLdef), 0, 100); Settings2.AnalyzeReliableConfidenceLevel = (float)iARCLset / 100.0f; diff --git a/src/EncodingDetection.cpp b/src/EncodingDetection.cpp index 5514065d0..a80870a7e 100644 --- a/src/EncodingDetection.cpp +++ b/src/EncodingDetection.cpp @@ -904,8 +904,8 @@ static void _SetEncodingTitleInfo(const char* encodingUCD, cpi_enc_t encUCD, flo const char* ukn = (!encodingUCD || (encodingUCD[0] == '\0')) ? "" : encodingUCD; StringCchCatA(chEncodingInfo, ARRAYSIZE(chEncodingInfo), (encUCD == CPI_ASCII_7BIT) ? "ASCII" : ukn); } - float const ucd_conf_perc = ucd_confidence * 100.0f; - StringCchPrintfA(tmpBuf, 128, "' Conf=%.0f%%", ucd_conf_perc); + int const ucd_conf_perc = float2int(ucd_confidence * 100.0f); + StringCchPrintfA(tmpBuf, ARRAYSIZE(tmpBuf), "' Conf=%i%%", ucd_conf_perc); StringCchCatA(chEncodingInfo, ARRAYSIZE(chEncodingInfo), tmpBuf); //~StringCchCatA(chEncodingInfo, ARRAYSIZE(chEncodingInfo), " || CED='"); @@ -920,15 +920,17 @@ static void _SetEncodingTitleInfo(const char* encodingUCD, cpi_enc_t encUCD, flo //~if ((encCED >= 0) || (encCED == CPI_ASCII_7BIT)) { //~ bool const ced_reliable = (ced_confidence >= Settings2.ReliableCEDConfidenceMapping); //~ bool const ced_not_reliable = (ced_confidence <= Settings2.UnReliableCEDConfidenceMapping); - //~ StringCchPrintfA(tmpBuf, 128, "' Conf=%.0f%% [%s])", ced_confidence * 100.0f, + //~ StringCchPrintfA(tmpBuf, ARRAYSIZE(tmpBuf), "' Conf=%.0f%% [%s])", ced_confidence * 100.0f, //~ ced_reliable ? "reliable" : (ced_not_reliable ? "NOT reliable" : "???")); //~ StringCchCatA(chEncodingInfo, ARRAYSIZE(chEncodingInfo), tmpBuf); //~} //~else { //~ StringCchCatA(chEncodingInfo, ARRAYSIZE(chEncodingInfo), "'"); //~} - - StringCchPrintfA(tmpBuf, ARRAYSIZE(tmpBuf), ucd_confidence >= Settings2.AnalyzeReliableConfidenceLevel ? " (reliable)" : " (NOT reliable)"); + + int const relThreshold = float2int(Settings2.AnalyzeReliableConfidenceLevel * 100.0f); + const char* rel_fmt = (ucd_conf_perc >= relThreshold) ? " (reliable (%i%%))" : " (NOT reliable(%i%%))"; + StringCchPrintfA(tmpBuf, ARRAYSIZE(tmpBuf), rel_fmt, relThreshold); StringCchCatA(chEncodingInfo, ARRAYSIZE(chEncodingInfo), tmpBuf); ::MultiByteToWideChar(CP_UTF7, 0, chEncodingInfo, -1, wchEncodingInfo, ARRAYSIZE(wchEncodingInfo)); @@ -1327,7 +1329,9 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(LPWSTR pszFile, const char* lpData, } } - encDetRes.bIsAnalysisReliable = (confidence >= Settings2.AnalyzeReliableConfidenceLevel); + int const iConfidence = float2int(confidence * 100.0f); + int const iReliableThreshold = float2int(Settings2.AnalyzeReliableConfidenceLevel * 100.0f); + encDetRes.bIsAnalysisReliable = (iConfidence >= iReliableThreshold); // -------------------------------------------------------------------------- // --- choose best encoding guess ---- diff --git a/src/Notepad3.c b/src/Notepad3.c index 8a52fd45e..ee4731816 100644 --- a/src/Notepad3.c +++ b/src/Notepad3.c @@ -9843,10 +9843,13 @@ bool FileRevert(LPCWSTR szFileName, bool bIgnoreCmdLnEnc) bool bPreserveView = true; DOCVIEWPOS_T const docView = EditGetCurrentDocView(Globals.hwndEdit); + Encoding_SrcWeak(CPI_NONE); if (bIgnoreCmdLnEnc) { - Encoding_Forced(CPI_NONE); // ignore history too + Encoding_Forced(CPI_NONE); // ignore history too + } + else if (Encoding_HasChanged(Encoding_Current(CPI_GET))) { + Encoding_SrcWeak(Encoding_Current(CPI_GET)); } - Encoding_SrcWeak(Encoding_Current(CPI_GET)); WCHAR tchFileName2[MAX_PATH] = { L'\0' }; StringCchCopyW(tchFileName2, COUNTOF(tchFileName2), szFileName); diff --git a/src/VersionEx.h b/src/VersionEx.h index fceb7fc17..7fc589b7f 100644 --- a/src/VersionEx.h +++ b/src/VersionEx.h @@ -9,7 +9,7 @@ #define VERSION_MAJOR 5 #define VERSION_MINOR 20 #define VERSION_REV 116 -#define VERSION_BUILD 2708 +#define VERSION_BUILD 2709 #define SCINTILLA_VER 423 #define ONIGURUMA_REGEX_VER 6.9.4 #define UCHARDET_VER 2018.09.27 diff --git a/uchardet/uchardet/src/CharDistribution.cpp b/uchardet/uchardet/src/CharDistribution.cpp index e3339bc72..860d79516 100644 --- a/uchardet/uchardet/src/CharDistribution.cpp +++ b/uchardet/uchardet/src/CharDistribution.cpp @@ -1,4 +1,4 @@ -/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- * vim: et sw=2 ts=2 fdm=marker */ /* ***** BEGIN LICENSE BLOCK ***** @@ -46,20 +46,18 @@ //#include "LangModels/GB2312Freq.tab" #include "LangModels/GB18030Freq.tab" -#define SURE_YES 0.99f -#define SURE_NO 0.01f - //return confidence base on received data float CharDistributionAnalysis::GetConfidence() { //if we didn't receive any character in our consideration range, or the // number of frequent characters is below the minimum threshold, return // negative answer - if (mTotalChars <= 0 || mFreqChars <= mDataThreshold) + if ((mTotalChars <= 0) || (mFreqChars < mDataThreshold)) return SURE_NO; - if (mTotalChars != mFreqChars) { - float r = mFreqChars / ((mTotalChars - mFreqChars) * mTypicalDistributionRatio); + if (mTotalChars > mFreqChars) + { + float r = (float)mFreqChars / ((mTotalChars - mFreqChars) * mTypicalDistributionRatio); if (r < SURE_YES) return r; diff --git a/uchardet/uchardet/src/CharDistribution.h b/uchardet/uchardet/src/CharDistribution.h index bcb60b975..def7cef63 100644 --- a/uchardet/uchardet/src/CharDistribution.h +++ b/uchardet/uchardet/src/CharDistribution.h @@ -1,4 +1,4 @@ -/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- * vim: et sw=2 ts=2 fdm=marker */ /* ***** BEGIN LICENSE BLOCK ***** @@ -42,10 +42,6 @@ #include "nscore.h" -#define ENOUGH_DATA_THRESHOLD 4096 - -#define MINIMUM_DATA_THRESHOLD 4 - class CharDistributionAnalysis { public: @@ -92,7 +88,7 @@ public: //It is not necessary to receive all data to draw conclusion. For charset detection, // certain amount of data is enough - PRBool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;}; + PRBool GotEnoughData() { return (mTotalChars >= ENOUGH_DATA_THRESHOLD); }; protected: //we do not handle character base on its original encoding string, but diff --git a/uchardet/uchardet/src/nsCharSetProber.h b/uchardet/uchardet/src/nsCharSetProber.h index 27bb16a33..e7e08677d 100644 --- a/uchardet/uchardet/src/nsCharSetProber.h +++ b/uchardet/uchardet/src/nsCharSetProber.h @@ -50,7 +50,6 @@ typedef enum { eNotMe = 2 //Negative answer } nsProbingState; -#define SHORTCUT_THRESHOLD (float)0.95 class nsCharSetProber { public: diff --git a/uchardet/uchardet/src/nsEscCharsetProber.h b/uchardet/uchardet/src/nsEscCharsetProber.h index 56b3828f6..719052313 100644 --- a/uchardet/uchardet/src/nsEscCharsetProber.h +++ b/uchardet/uchardet/src/nsEscCharsetProber.h @@ -53,7 +53,7 @@ public: const char* GetCharSetName() {return mDetectedCharset;}; nsProbingState GetState(void) {return mState;}; void Reset(void); - float GetConfidence(void){return (float)0.99;}; + float GetConfidence(void){return SURE_YES;}; void SetOpion() {}; protected: diff --git a/uchardet/uchardet/src/nsGB18030Prober.cpp b/uchardet/uchardet/src/nsGB18030Prober.cpp index f6d5c5b76..760aef6d0 100644 --- a/uchardet/uchardet/src/nsGB18030Prober.cpp +++ b/uchardet/uchardet/src/nsGB18030Prober.cpp @@ -1,4 +1,4 @@ -/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * @@ -89,8 +89,6 @@ nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen) float nsGB18030Prober::GetConfidence(void) { - float distribCf = mDistributionAnalyser.GetConfidence(); - - return (float)distribCf; + return mDistributionAnalyser.GetConfidence(); } diff --git a/uchardet/uchardet/src/nsHebrewProber.cpp b/uchardet/uchardet/src/nsHebrewProber.cpp index 9becb821e..6a039c094 100644 --- a/uchardet/uchardet/src/nsHebrewProber.cpp +++ b/uchardet/uchardet/src/nsHebrewProber.cpp @@ -58,7 +58,7 @@ // Minimum Visual vs Logical model score difference. // If the difference is below this, don't rely at all on the model score distance. -#define MIN_MODEL_DISTANCE (0.01) +#define MIN_MODEL_DISTANCE (0.01f) #define VISUAL_HEBREW_NAME ("ISO-8859-8") #define LOGICAL_HEBREW_NAME ("WINDOWS-1255") diff --git a/uchardet/uchardet/src/nsLatin1Prober.cpp b/uchardet/uchardet/src/nsLatin1Prober.cpp index 9dc76a789..c1cc7d8f3 100644 --- a/uchardet/uchardet/src/nsLatin1Prober.cpp +++ b/uchardet/uchardet/src/nsLatin1Prober.cpp @@ -149,23 +149,22 @@ nsProbingState nsLatin1Prober::HandleData(const char* aBuf, PRUint32 aLen) float nsLatin1Prober::GetConfidence(void) { if (mState == eNotMe) - return 0.01f; + return SURE_NO; - float confidence; PRUint32 total = 0; - for (PRInt32 i = 0; i < FREQ_CAT_NUM; i++) + for (PRInt32 i = 0; i < FREQ_CAT_NUM; i++) { total += mFreqCounter[i]; - - if(!total) - confidence = 0.0f; - else - { - confidence = mFreqCounter[3]*1.0f / total; - confidence -= mFreqCounter[1]*20.0f/total; } - if (confidence < 0.0f) - confidence = 0.0f; + float confidence = 0.0f; + + if (total) + { + confidence = (float)mFreqCounter[3] / (float)total; + confidence -= (float)mFreqCounter[1] * 20.0f / (float)total; + } + + if (confidence < 0.0f) { confidence = 0.0f; } // lower the confidence of latin1 so that other more accurate detector // can take priority. diff --git a/uchardet/uchardet/src/nsMBCSGroupProber.cpp b/uchardet/uchardet/src/nsMBCSGroupProber.cpp index 1e18f4dd6..c1051b5c9 100644 --- a/uchardet/uchardet/src/nsMBCSGroupProber.cpp +++ b/uchardet/uchardet/src/nsMBCSGroupProber.cpp @@ -59,10 +59,8 @@ const char *ProberName[] = #endif nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter) + : mNumOfProbers(MAX_NUM_OF_MBCS_PROBERS), mBestGuess(-1), mActiveNum(0) { - for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++) { - mProbers[i] = nsnull; - } PRUint32 i = 0; mProbers[i++] = new nsUTF8Prober(); if (aLanguageFilter & NS_FILTER_JAPANESE) @@ -84,14 +82,19 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter) mProbers[i++] = new nsBig5Prober(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL); mProbers[i++] = new nsEUCTWProber(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL); } + + mNumOfProbers = i; + + for (; i < MAX_NUM_OF_MBCS_PROBERS; ++i) { mProbers[i] = nsnull; } + Reset(); } nsMBCSGroupProber::~nsMBCSGroupProber() { - for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++) + for (PRUint32 i = 0; i < MAX_NUM_OF_MBCS_PROBERS; ++i) { - delete mProbers[i]; + if (mProbers[i]) { delete mProbers[i]; } } } @@ -100,8 +103,8 @@ const char* nsMBCSGroupProber::GetCharSetName() if (mBestGuess == -1) { GetConfidence(); - if (mBestGuess == -1) - mBestGuess = 0; + + if (mBestGuess == -1) { mBestGuess = 0; } } return mProbers[mBestGuess]->GetCharSetName(); } @@ -109,7 +112,7 @@ const char* nsMBCSGroupProber::GetCharSetName() void nsMBCSGroupProber::Reset(void) { mActiveNum = 0; - for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++) + for (PRUint32 i = 0; i < MAX_NUM_OF_MBCS_PROBERS; i++) { if (mProbers[i]) { @@ -144,7 +147,7 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen) { if (--keepNext == 0) { - for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++) + for (PRUint32 i = 0; i < mNumOfProbers; i++) { if (!mIsActive[i]) continue; @@ -161,7 +164,7 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen) } if (keepNext) { - for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++) + for (PRUint32 i = 0; i < mNumOfProbers; i++) { if (!mIsActive[i]) continue; @@ -179,23 +182,22 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen) return mState; } -float nsMBCSGroupProber::GetConfidence(void) +float nsMBCSGroupProber::GetConfidence() { - PRUint32 i; - float bestConf = 0.0, cf; + float bestConf = 0.0f; switch (mState) { case eFoundIt: - return (float)0.99; + return SURE_YES; case eNotMe: - return (float)0.01; + return SURE_NO; default: - for (i = 0; i < NUM_OF_PROBERS; i++) + for (PRUint32 i = 0; i < mNumOfProbers; i++) { if (!mIsActive[i]) continue; - cf = mProbers[i]->GetConfidence(); + float const cf = mProbers[i]->GetConfidence(); if (bestConf < cf) { bestConf = cf; @@ -209,17 +211,14 @@ float nsMBCSGroupProber::GetConfidence(void) #ifdef DEBUG_chardet void nsMBCSGroupProber::DumpStatus() { - PRUint32 i; - float cf; - GetConfidence(); - for (i = 0; i < NUM_OF_PROBERS; i++) + for (PRUint32 i = 0; i < mNumOfProbers; i++) { if (!mIsActive[i]) printf(" MBCS inactive: [%s] (confidence is too low).\r\n", ProberName[i]); else { - cf = mProbers[i]->GetConfidence(); + float const cf = mProbers[i]->GetConfidence(); printf(" MBCS %1.3f: [%s]\r\n", cf, ProberName[i]); } } @@ -229,7 +228,7 @@ void nsMBCSGroupProber::DumpStatus() #ifdef DEBUG_jgmyers void nsMBCSGroupProber::GetDetectorState(nsUniversalDetector::DetectorState (&states)[nsUniversalDetector::NumDetectors], PRUint32 &offset) { - for (PRUint32 i = 0; i < NUM_OF_PROBERS; ++i) { + for (PRUint32 i = 0; i < mNumOfProbers; ++i) { states[offset].name = ProberName[i]; states[offset].isActive = mIsActive[i]; states[offset].confidence = mIsActive[i] ? mProbers[i]->GetConfidence() : 0.0; diff --git a/uchardet/uchardet/src/nsMBCSGroupProber.h b/uchardet/uchardet/src/nsMBCSGroupProber.h index 42b8a6e41..df159e24d 100644 --- a/uchardet/uchardet/src/nsMBCSGroupProber.h +++ b/uchardet/uchardet/src/nsMBCSGroupProber.h @@ -1,4 +1,4 @@ -/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- * vim: et sw=2 ts=2 fdm=marker */ /* ***** BEGIN LICENSE BLOCK ***** @@ -50,7 +50,7 @@ #include "nsBig5Prober.h" #include "nsEUCTWProber.h" -#define NUM_OF_PROBERS 7 +#define MAX_NUM_OF_MBCS_PROBERS 7 class nsMBCSGroupProber: public nsCharSetProber { public: @@ -73,9 +73,10 @@ public: protected: nsProbingState mState; - nsCharSetProber* mProbers[NUM_OF_PROBERS]; - PRBool mIsActive[NUM_OF_PROBERS]; - PRInt32 mBestGuess; + nsCharSetProber* mProbers[MAX_NUM_OF_MBCS_PROBERS]; + PRBool mIsActive[MAX_NUM_OF_MBCS_PROBERS]; + PRUint32 mNumOfProbers; + PRInt32 mBestGuess; PRUint32 mActiveNum; PRUint32 mKeepNext; }; diff --git a/uchardet/uchardet/src/nsSBCSGroupProber.cpp b/uchardet/uchardet/src/nsSBCSGroupProber.cpp index 8fb778ce2..71aed6310 100644 --- a/uchardet/uchardet/src/nsSBCSGroupProber.cpp +++ b/uchardet/uchardet/src/nsSBCSGroupProber.cpp @@ -1,4 +1,4 @@ -/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- * vim: et sw=2 ts=2 fdm=marker */ /* ***** BEGIN LICENSE BLOCK ***** @@ -46,173 +46,183 @@ #include "nsHebrewProber.h" + nsSBCSGroupProber::nsSBCSGroupProber() + : mNumOfProbers(MAX_NUM_OF_SBCS_PROBERS), mBestGuess(-1), mActiveNum(0) { - mProbers[0] = new nsSingleByteCharSetProber(&Win1251RussianModel); - mProbers[1] = new nsSingleByteCharSetProber(&Koi8rRussianModel); - mProbers[2] = new nsSingleByteCharSetProber(&Latin5RussianModel); - mProbers[3] = new nsSingleByteCharSetProber(&MacCyrillicRussianModel); - mProbers[4] = new nsSingleByteCharSetProber(&Ibm866RussianModel); - mProbers[5] = new nsSingleByteCharSetProber(&Ibm855RussianModel); + PRUint32 i = 0; + mProbers[i++] = new nsSingleByteCharSetProber(&Win1251RussianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Koi8rRussianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Latin5RussianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&MacCyrillicRussianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Ibm866RussianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Ibm855RussianModel); - mProbers[6] = new nsSingleByteCharSetProber(&Iso_8859_7GreekModel); - mProbers[7] = new nsSingleByteCharSetProber(&Windows_1253GreekModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_7GreekModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1253GreekModel); - mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel); - mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Latin5BulgarianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Win1251BulgarianModel); nsHebrewProber *hebprober = new nsHebrewProber(); // Notice: Any change in these indexes - 10,11,12 must be reflected // in the code below as well. - mProbers[10] = hebprober; - mProbers[11] = new nsSingleByteCharSetProber(&Win1255Model, PR_FALSE, hebprober); // Logical Hebrew - mProbers[12] = new nsSingleByteCharSetProber(&Win1255Model, PR_TRUE, hebprober); // Visual Hebrew + PRUint32 const heb = i; + mProbers[i++] = hebprober; + mProbers[i++] = new nsSingleByteCharSetProber(&Win1255Model, PR_FALSE, hebprober); // Logical Hebrew + mProbers[i++] = new nsSingleByteCharSetProber(&Win1255Model, PR_TRUE, hebprober); // Visual Hebrew // Tell the Hebrew prober about the logical and visual probers - if (mProbers[10] && mProbers[11] && mProbers[12]) // all are not null + if (mProbers[heb] && mProbers[heb+1] && mProbers[heb+2]) // all are not null { - hebprober->SetModelProbers(mProbers[11], mProbers[12]); + hebprober->SetModelProbers(mProbers[heb+1], mProbers[heb+2]); } else // One or more is null. avoid any Hebrew probing, null them all { - for (PRUint32 i = 10; i <= 12; ++i) + for (PRUint32 j = heb + 2; j >= heb; --j) { - delete mProbers[i]; - mProbers[i] = 0; + delete mProbers[j]; + mProbers[j] = nsnull; } } + mProbers[i++] = new nsSingleByteCharSetProber(&Tis_620ThaiModel); - mProbers[13] = new nsSingleByteCharSetProber(&Tis_620ThaiModel); - mProbers[14] = new nsSingleByteCharSetProber(&Iso_8859_11ThaiModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252AfricaansModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1AfricaansModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9AfricaansModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15AfricaansModel); - mProbers[15] = new nsSingleByteCharSetProber(&Iso_8859_1FrenchModel); - mProbers[16] = new nsSingleByteCharSetProber(&Iso_8859_15FrenchModel); - mProbers[17] = new nsSingleByteCharSetProber(&Windows_1252FrenchModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1GermanModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252GermanModel); - mProbers[18] = new nsSingleByteCharSetProber(&Iso_8859_1SpanishModel); - mProbers[19] = new nsSingleByteCharSetProber(&Iso_8859_15SpanishModel); - mProbers[20] = new nsSingleByteCharSetProber(&Windows_1252SpanishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1FrenchModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15FrenchModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252FrenchModel); - mProbers[21] = new nsSingleByteCharSetProber(&Iso_8859_2HungarianModel); - mProbers[22] = new nsSingleByteCharSetProber(&Windows_1250HungarianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1SpanishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15SpanishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252SpanishModel); - mProbers[23] = new nsSingleByteCharSetProber(&Iso_8859_1GermanModel); - mProbers[24] = new nsSingleByteCharSetProber(&Windows_1252GermanModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1PortugueseModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9PortugueseModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15PortugueseModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252PortugueseModel); - mProbers[25] = new nsSingleByteCharSetProber(&Iso_8859_3EsperantoModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_2HungarianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1250HungarianModel); - mProbers[26] = new nsSingleByteCharSetProber(&Iso_8859_3TurkishModel); - mProbers[27] = new nsSingleByteCharSetProber(&Iso_8859_9TurkishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_3EsperantoModel); - mProbers[28] = new nsSingleByteCharSetProber(&Iso_8859_6ArabicModel); - mProbers[29] = new nsSingleByteCharSetProber(&Windows_1256ArabicModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_3TurkishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9TurkishModel); - mProbers[30] = new nsSingleByteCharSetProber(&VisciiVietnameseModel); - mProbers[31] = new nsSingleByteCharSetProber(&Windows_1258VietnameseModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15DanishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1DanishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252DanishModel); - mProbers[32] = new nsSingleByteCharSetProber(&Iso_8859_15DanishModel); - mProbers[33] = new nsSingleByteCharSetProber(&Iso_8859_1DanishModel); - mProbers[34] = new nsSingleByteCharSetProber(&Windows_1252DanishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_13LithuanianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_10LithuanianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_4LithuanianModel); - mProbers[35] = new nsSingleByteCharSetProber(&Iso_8859_13LithuanianModel); - mProbers[36] = new nsSingleByteCharSetProber(&Iso_8859_10LithuanianModel); - mProbers[37] = new nsSingleByteCharSetProber(&Iso_8859_4LithuanianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_13LatvianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_10LatvianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_4LatvianModel); - mProbers[38] = new nsSingleByteCharSetProber(&Iso_8859_13LatvianModel); - mProbers[39] = new nsSingleByteCharSetProber(&Iso_8859_10LatvianModel); - mProbers[40] = new nsSingleByteCharSetProber(&Iso_8859_4LatvianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_3MalteseModel); - mProbers[41] = new nsSingleByteCharSetProber(&Iso_8859_1PortugueseModel); - mProbers[42] = new nsSingleByteCharSetProber(&Iso_8859_9PortugueseModel); - mProbers[43] = new nsSingleByteCharSetProber(&Iso_8859_15PortugueseModel); - mProbers[44] = new nsSingleByteCharSetProber(&Windows_1252PortugueseModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1250CzechModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_2CzechModel); + mProbers[i++] = new nsSingleByteCharSetProber(&MaccentraleuropeCzechModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Ibm852CzechModel); - mProbers[45] = new nsSingleByteCharSetProber(&Iso_8859_3MalteseModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1250SlovakModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_2SlovakModel); + mProbers[i++] = new nsSingleByteCharSetProber(&MaccentraleuropeSlovakModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Ibm852SlovakModel); - mProbers[46] = new nsSingleByteCharSetProber(&Windows_1250CzechModel); - mProbers[47] = new nsSingleByteCharSetProber(&Iso_8859_2CzechModel); - mProbers[48] = new nsSingleByteCharSetProber(&MaccentraleuropeCzechModel); - mProbers[49] = new nsSingleByteCharSetProber(&Ibm852CzechModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1250PolishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_2PolishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_13PolishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_16PolishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&MaccentraleuropePolishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Ibm852PolishModel); - mProbers[50] = new nsSingleByteCharSetProber(&Windows_1250SlovakModel); - mProbers[51] = new nsSingleByteCharSetProber(&Iso_8859_2SlovakModel); - mProbers[52] = new nsSingleByteCharSetProber(&MaccentraleuropeSlovakModel); - mProbers[53] = new nsSingleByteCharSetProber(&Ibm852SlovakModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1FinnishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_4FinnishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9FinnishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_13FinnishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15FinnishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252FinnishModel); - mProbers[54] = new nsSingleByteCharSetProber(&Windows_1250PolishModel); - mProbers[55] = new nsSingleByteCharSetProber(&Iso_8859_2PolishModel); - mProbers[56] = new nsSingleByteCharSetProber(&Iso_8859_13PolishModel); - mProbers[57] = new nsSingleByteCharSetProber(&Iso_8859_16PolishModel); - mProbers[58] = new nsSingleByteCharSetProber(&MaccentraleuropePolishModel); - mProbers[59] = new nsSingleByteCharSetProber(&Ibm852PolishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1ItalianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_3ItalianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9ItalianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15ItalianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252ItalianModel); - mProbers[60] = new nsSingleByteCharSetProber(&Iso_8859_1FinnishModel); - mProbers[61] = new nsSingleByteCharSetProber(&Iso_8859_4FinnishModel); - mProbers[62] = new nsSingleByteCharSetProber(&Iso_8859_9FinnishModel); - mProbers[63] = new nsSingleByteCharSetProber(&Iso_8859_13FinnishModel); - mProbers[64] = new nsSingleByteCharSetProber(&Iso_8859_15FinnishModel); - mProbers[65] = new nsSingleByteCharSetProber(&Windows_1252FinnishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1250CroatianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_2CroatianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_13CroatianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_16CroatianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&MaccentraleuropeCroatianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Ibm852CroatianModel); - mProbers[66] = new nsSingleByteCharSetProber(&Iso_8859_1ItalianModel); - mProbers[67] = new nsSingleByteCharSetProber(&Iso_8859_3ItalianModel); - mProbers[68] = new nsSingleByteCharSetProber(&Iso_8859_9ItalianModel); - mProbers[69] = new nsSingleByteCharSetProber(&Iso_8859_15ItalianModel); - mProbers[70] = new nsSingleByteCharSetProber(&Windows_1252ItalianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252EstonianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1257EstonianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_4EstonianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_13EstonianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15EstonianModel); - mProbers[71] = new nsSingleByteCharSetProber(&Windows_1250CroatianModel); - mProbers[72] = new nsSingleByteCharSetProber(&Iso_8859_2CroatianModel); - mProbers[73] = new nsSingleByteCharSetProber(&Iso_8859_13CroatianModel); - mProbers[74] = new nsSingleByteCharSetProber(&Iso_8859_16CroatianModel); - mProbers[75] = new nsSingleByteCharSetProber(&MaccentraleuropeCroatianModel); - mProbers[76] = new nsSingleByteCharSetProber(&Ibm852CroatianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1IrishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9IrishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15IrishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252IrishModel); - mProbers[77] = new nsSingleByteCharSetProber(&Windows_1252EstonianModel); - mProbers[78] = new nsSingleByteCharSetProber(&Windows_1257EstonianModel); - mProbers[79] = new nsSingleByteCharSetProber(&Iso_8859_4EstonianModel); - mProbers[80] = new nsSingleByteCharSetProber(&Iso_8859_13EstonianModel); - mProbers[81] = new nsSingleByteCharSetProber(&Iso_8859_15EstonianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1250RomanianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_2RomanianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_16RomanianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Ibm852RomanianModel); - mProbers[82] = new nsSingleByteCharSetProber(&Iso_8859_1IrishModel); - mProbers[83] = new nsSingleByteCharSetProber(&Iso_8859_9IrishModel); - mProbers[84] = new nsSingleByteCharSetProber(&Iso_8859_15IrishModel); - mProbers[85] = new nsSingleByteCharSetProber(&Windows_1252IrishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1250SloveneModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_2SloveneModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_16SloveneModel); + mProbers[i++] = new nsSingleByteCharSetProber(&MaccentraleuropeSloveneModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Ibm852SloveneModel); - mProbers[86] = new nsSingleByteCharSetProber(&Windows_1250RomanianModel); - mProbers[87] = new nsSingleByteCharSetProber(&Iso_8859_2RomanianModel); - mProbers[88] = new nsSingleByteCharSetProber(&Iso_8859_16RomanianModel); - mProbers[89] = new nsSingleByteCharSetProber(&Ibm852RomanianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1SwedishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_4SwedishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9SwedishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15SwedishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252SwedishModel); - mProbers[90] = new nsSingleByteCharSetProber(&Windows_1250SloveneModel); - mProbers[91] = new nsSingleByteCharSetProber(&Iso_8859_2SloveneModel); - mProbers[92] = new nsSingleByteCharSetProber(&Iso_8859_16SloveneModel); - mProbers[93] = new nsSingleByteCharSetProber(&MaccentraleuropeSloveneModel); - mProbers[94] = new nsSingleByteCharSetProber(&Ibm852SloveneModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1251BelarusianModel); - mProbers[95] = new nsSingleByteCharSetProber(&Iso_8859_1SwedishModel); - mProbers[96] = new nsSingleByteCharSetProber(&Iso_8859_4SwedishModel); - mProbers[97] = new nsSingleByteCharSetProber(&Iso_8859_9SwedishModel); - mProbers[98] = new nsSingleByteCharSetProber(&Iso_8859_15SwedishModel); - mProbers[99] = new nsSingleByteCharSetProber(&Windows_1252SwedishModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252NederlandsModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1NederlandsModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9NederlandsModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15NederlandsModel); - mProbers[100] = new nsSingleByteCharSetProber(&Windows_1252AfricaansModel); - mProbers[101] = new nsSingleByteCharSetProber(&Iso_8859_1AfricaansModel); - mProbers[102] = new nsSingleByteCharSetProber(&Iso_8859_9AfricaansModel); - mProbers[103] = new nsSingleByteCharSetProber(&Iso_8859_15AfricaansModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_6ArabicModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1256ArabicModel); - mProbers[104] = new nsSingleByteCharSetProber(&Windows_1251BelarusianModel); + mProbers[i++] = new nsSingleByteCharSetProber(&VisciiVietnameseModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1258VietnameseModel); + + + //mProbers[i++] = new nsSingleByteCharSetProber(&Tis_620ThaiModel); + mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_11ThaiModel); + + mNumOfProbers = i; + + for (; i < MAX_NUM_OF_SBCS_PROBERS; ++i) { mProbers[i] = nsnull; } - mProbers[105] = new nsSingleByteCharSetProber(&Windows_1252NederlandsModel); - mProbers[106] = new nsSingleByteCharSetProber(&Iso_8859_1NederlandsModel); - mProbers[107] = new nsSingleByteCharSetProber(&Iso_8859_9NederlandsModel); - mProbers[108] = new nsSingleByteCharSetProber(&Iso_8859_15NederlandsModel); - Reset(); } nsSBCSGroupProber::~nsSBCSGroupProber() { - for (PRUint32 i = 0; i < NUM_OF_SBCS_PROBERS; i++) + for (PRUint32 i = 0; i < MAX_NUM_OF_SBCS_PROBERS; i++) { - delete mProbers[i]; + if (mProbers[i]) { delete mProbers[i]; } } } @@ -234,7 +244,7 @@ const char* nsSBCSGroupProber::GetCharSetName() void nsSBCSGroupProber::Reset(void) { mActiveNum = 0; - for (PRUint32 i = 0; i < NUM_OF_SBCS_PROBERS; i++) + for (PRUint32 i = 0; i < MAX_NUM_OF_SBCS_PROBERS; ++i) { if (mProbers[i]) // not null { @@ -242,8 +252,9 @@ void nsSBCSGroupProber::Reset(void) mIsActive[i] = PR_TRUE; ++mActiveNum; } - else + else { mIsActive[i] = PR_FALSE; + } } mBestGuess = -1; mState = eDetecting; @@ -269,7 +280,7 @@ nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen) if (newLen1 == 0) goto done; // Nothing to see here, move on. - for (i = 0; i < NUM_OF_SBCS_PROBERS; i++) + for (i = 0; i < mNumOfProbers; i++) { if (!mIsActive[i]) continue; @@ -300,21 +311,20 @@ done: float nsSBCSGroupProber::GetConfidence(void) { - PRUint32 i; - float bestConf = 0.0, cf; + float bestConf = 0.0f; switch (mState) { case eFoundIt: - return (float)0.99; //sure yes + return SURE_YES; case eNotMe: - return (float)0.01; //sure no + return SURE_NO; default: - for (i = 0; i < NUM_OF_SBCS_PROBERS; i++) + for (PRUint32 i = 0; i < mNumOfProbers; i++) { if (!mIsActive[i]) continue; - cf = mProbers[i]->GetConfidence(); + float const cf = mProbers[i]->GetConfidence(); if (bestConf < cf) { bestConf = cf; @@ -333,7 +343,7 @@ void nsSBCSGroupProber::DumpStatus() cf = GetConfidence(); printf(" SBCS Group Prober --------begin status \r\n"); - for (i = 0; i < NUM_OF_SBCS_PROBERS; i++) + for (i = 0; i < mNumOfProbers; i++) { if (!mIsActive[i]) printf(" inactive: [%s] (i.e. confidence is too low).\r\n", mProbers[i]->GetCharSetName()); diff --git a/uchardet/uchardet/src/nsSBCSGroupProber.h b/uchardet/uchardet/src/nsSBCSGroupProber.h index 2b1b78a90..d71cbb8eb 100644 --- a/uchardet/uchardet/src/nsSBCSGroupProber.h +++ b/uchardet/uchardet/src/nsSBCSGroupProber.h @@ -1,4 +1,4 @@ -/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- * vim: et sw=2 ts=2 fdm=marker */ /* ***** BEGIN LICENSE BLOCK ***** @@ -42,10 +42,11 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 109 +#define MAX_NUM_OF_SBCS_PROBERS 109 class nsCharSetProber; -class nsSBCSGroupProber: public nsCharSetProber { + +class nsSBCSGroupProber : public nsCharSetProber { public: nsSBCSGroupProber(); virtual ~nsSBCSGroupProber(); @@ -62,8 +63,9 @@ public: protected: nsProbingState mState; - nsCharSetProber* mProbers[NUM_OF_SBCS_PROBERS]; - PRBool mIsActive[NUM_OF_SBCS_PROBERS]; + nsCharSetProber* mProbers[MAX_NUM_OF_SBCS_PROBERS]; + PRBool mIsActive[MAX_NUM_OF_SBCS_PROBERS]; + PRUint32 mNumOfProbers; PRInt32 mBestGuess; PRUint32 mActiveNum; }; diff --git a/uchardet/uchardet/src/nsSBCharSetProber.cpp b/uchardet/uchardet/src/nsSBCharSetProber.cpp index d2d31168b..88d85bf26 100644 --- a/uchardet/uchardet/src/nsSBCharSetProber.cpp +++ b/uchardet/uchardet/src/nsSBCharSetProber.cpp @@ -1,4 +1,4 @@ -/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- * vim: et sw=2 ts=2 fdm=marker */ /* ***** BEGIN LICENSE BLOCK ***** @@ -82,7 +82,7 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32 if (mState == eDetecting) if (mTotalSeqs > SB_ENOUGH_REL_THRESHOLD) { - float cf = GetConfidence(); + float const cf = GetConfidence(); if (cf > POSITIVE_SHORTCUT_THRESHOLD) mState = eFoundIt; else if (cf < NEGATIVE_SHORTCUT_THRESHOLD) @@ -112,12 +112,22 @@ float nsSingleByteCharSetProber::GetConfidence(void) if (mTotalSeqs > 0) if (mTotalSeqs > mSeqCounters[NEGATIVE_CAT]*10 ) return ((float)(mTotalSeqs - mSeqCounters[NEGATIVE_CAT]*10))/mTotalSeqs * mFreqChar / mTotalChar; - return (float)0.01; + return SURE_NO; #else //POSITIVE_APPROACH - float r; - if (mTotalSeqs > 0) { - r = ((float)1.0) * mSeqCounters[POSITIVE_CAT] / mTotalSeqs / mModel->mTypicalPositiveRatio; + #define ffactor(m,d) (((d) > 0) ? ((float)(m)/(float)(d)) : 1.0f) + + PRUint32 const txtChar = (mTotalChar > mCtrlChar) ? (mTotalChar - mCtrlChar) : (mTotalSeqs << 1); + + if ((txtChar > 0) && (mTotalSeqs > 0)) + { + PRUint32 const goodSeqCnt = mSeqCounters[POSITIVE_CAT] + (mSeqCounters[PROBABLE_CAT] >> 1); + + float r = mModel->mTypicalPositiveRatio; + + // negative sequence correction factor + r *= ffactor(goodSeqCnt, mTotalSeqs + (mSeqCounters[NEGATIVE_CAT] << 4)); + /* Multiply by a ratio of positive sequences per characters. * This would help in particular to distinguish close winners. * Indeed if you add a letter, you'd expect the positive sequence count @@ -126,18 +136,21 @@ float nsSingleByteCharSetProber::GetConfidence(void) * character). This could make the difference between very closely related * charsets used for the same language. */ - r = r * mSeqCounters[POSITIVE_CAT] / mTotalChar; - //r = r * (mSeqCounters[POSITIVE_CAT] + (float) mSeqCounters[PROBABLE_CAT] / 4) / mTotalChar; + r *= ffactor(goodSeqCnt + mSeqCounters[NEUTRAL_CAT], txtChar); + /* The more control characters (proportionnaly to the size of the text), the * less confident we become in the current charset. */ - r = r * (mTotalChar - mCtrlChar) / mTotalChar; - r = r*mFreqChar/mTotalChar; - if (r >= (float)1.00) - r = (float)0.99; + r *= ffactor(txtChar, mTotalChar); + + // normalizing + r *= ffactor(mFreqChar, mTotalChar); + + if (r >= 1.00f) { r = SURE_YES; } + return r; } - return (float)0.01; + return SURE_NO; #endif } diff --git a/uchardet/uchardet/src/nsSBCharSetProber.h b/uchardet/uchardet/src/nsSBCharSetProber.h index 5097d2aea..bb15037fb 100644 --- a/uchardet/uchardet/src/nsSBCharSetProber.h +++ b/uchardet/uchardet/src/nsSBCharSetProber.h @@ -1,4 +1,4 @@ -/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- * vim: et sw=2 ts=2 fdm=marker */ /* ***** BEGIN LICENSE BLOCK ***** @@ -55,9 +55,9 @@ /* Numbers 0-9. */ #define NUM 251 -#define SB_ENOUGH_REL_THRESHOLD 1024 -#define POSITIVE_SHORTCUT_THRESHOLD (float)0.95 -#define NEGATIVE_SHORTCUT_THRESHOLD (float)0.05 +#define SB_ENOUGH_REL_THRESHOLD min(512, ENOUGH_DATA_THRESHOLD) +#define POSITIVE_SHORTCUT_THRESHOLD SHORTCUT_THRESHOLD +#define NEGATIVE_SHORTCUT_THRESHOLD (0.05f) #define SYMBOL_CAT_ORDER 250 #define NUMBER_OF_SEQ_CAT 4 #define POSITIVE_CAT (NUMBER_OF_SEQ_CAT-1) diff --git a/uchardet/uchardet/src/nsUTF8Prober.cpp b/uchardet/uchardet/src/nsUTF8Prober.cpp index 937fcc9bc..612c01cb6 100644 --- a/uchardet/uchardet/src/nsUTF8Prober.cpp +++ b/uchardet/uchardet/src/nsUTF8Prober.cpp @@ -1,4 +1,4 @@ -/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- * vim: et sw=2 ts=2 fdm=marker */ /* ***** BEGIN LICENSE BLOCK ***** @@ -71,19 +71,19 @@ nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen) return mState; } -#define ONE_CHAR_PROB (float)0.50 +#define ONE_CHAR_PROB (0.50f) float nsUTF8Prober::GetConfidence(void) { - float unlike = (float)0.99; + float unlike = SURE_YES; if (mNumOfMBChar < 6) { for (PRUint32 i = 0; i < mNumOfMBChar; i++) unlike *= ONE_CHAR_PROB; - return (float)1.0 - unlike; + return (1.0f - unlike); } else - return (float)0.99; + return SURE_YES; } diff --git a/uchardet/uchardet/src/nsUniversalDetector.cpp b/uchardet/uchardet/src/nsUniversalDetector.cpp index 44247ece7..e97190fe0 100644 --- a/uchardet/uchardet/src/nsUniversalDetector.cpp +++ b/uchardet/uchardet/src/nsUniversalDetector.cpp @@ -107,8 +107,6 @@ nsUniversalDetector::Reset() } //--------------------------------------------------------------------- -#define SHORTCUT_THRESHOLD (float)0.95 -#define MINIMUM_THRESHOLD (float)0.20 nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) { @@ -243,7 +241,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) PRUint32 i; for (i = 0; i < aLen; i++) { - //other than 0xa0, if every othe character is ascii, the page is ascii + //other than 0xa0, if every other character is ascii, the page is ascii if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') //Since many Ascii only page contains NBSP { //we got a non-ascii byte (high-byte) diff --git a/uchardet/uchardet/src/nscore.h b/uchardet/uchardet/src/nscore.h index f367448c7..f506d6376 100644 --- a/uchardet/uchardet/src/nscore.h +++ b/uchardet/uchardet/src/nscore.h @@ -1,4 +1,4 @@ -/* ***** BEGIN LICENSE BLOCK ***** +/* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * * The contents of this file are subject to the Mozilla Public License Version @@ -45,9 +45,23 @@ typedef unsigned short PRUint16; typedef signed char PRInt8; typedef unsigned char PRUint8; +#define nsnull nullptr + #define PR_FALSE false #define PR_TRUE true -#define nsnull 0 + +#define MINIMUM_DATA_THRESHOLD 4 +#define ENOUGH_DATA_THRESHOLD 1024 + +#define SURE_YES (0.99f) +#define SURE_NO (0.01f) + +#define SHORTCUT_THRESHOLD (0.95f) +#define MINIMUM_THRESHOLD (0.20f) + +#ifndef min +#define min(x,y) (((x) < (y)) ? (x) : (y)) +#endif #ifdef _MSC_VER #ifdef strdup