mirror of
https://github.com/rizonesoft/Notepad3.git
synced 2026-06-11 21:03:05 +08:00
+ chg: finetuning of Single Byte Char Set (SBCS) detection
This commit is contained in:
parent
ac1ba2496f
commit
3cefe783cb
@ -1279,7 +1279,7 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(LPWSTR pszFile, const char* lpData,
|
||||
if (encDetRes.analyzedEncoding == CPI_NONE)
|
||||
{
|
||||
encDetRes.analyzedEncoding = iAnalyzeFallback;
|
||||
confidence = Settings2.AnalyzeReliableConfidenceLevel / 4.0f;
|
||||
confidence = (1.0f - Settings2.AnalyzeReliableConfidenceLevel);
|
||||
}
|
||||
|
||||
if (!bSkipUTFDetection)
|
||||
@ -1322,7 +1322,7 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(LPWSTR pszFile, const char* lpData,
|
||||
if (encDetRes.analyzedEncoding == CPI_NONE)
|
||||
{
|
||||
encDetRes.analyzedEncoding = iAnalyzeFallback;
|
||||
confidence = Settings2.AnalyzeReliableConfidenceLevel / 4.0f;
|
||||
confidence = (1.0f - Settings2.AnalyzeReliableConfidenceLevel);
|
||||
}
|
||||
else if (encDetRes.analyzedEncoding == CPI_ASCII_7BIT) {
|
||||
encDetRes.analyzedEncoding = asciiEnc;
|
||||
|
||||
@ -51,72 +51,41 @@ nsSBCSGroupProber::nsSBCSGroupProber()
|
||||
: mNumOfProbers(MAX_NUM_OF_SBCS_PROBERS), mBestGuess(-1), mActiveNum(0)
|
||||
{
|
||||
PRUint32 i = 0;
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Win1251RussianModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Koi8rRussianModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Latin5RussianModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&MacCyrillicRussianModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Ibm866RussianModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Ibm855RussianModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_7GreekModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1253GreekModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252GermanModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1GermanModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Latin5BulgarianModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Win1251BulgarianModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252FrenchModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1FrenchModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15FrenchModel);
|
||||
|
||||
nsHebrewProber *hebprober = new nsHebrewProber();
|
||||
// Notice: Any change in these indexes - 10,11,12 must be reflected
|
||||
// in the code below as well.
|
||||
PRUint32 const heb = i;
|
||||
mProbers[i++] = hebprober;
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Win1255Model, PR_FALSE, hebprober); // Logical Hebrew
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Win1255Model, PR_TRUE, hebprober); // Visual Hebrew
|
||||
// Tell the Hebrew prober about the logical and visual probers
|
||||
if (mProbers[heb] && mProbers[heb+1] && mProbers[heb+2]) // all are not null
|
||||
{
|
||||
hebprober->SetModelProbers(mProbers[heb+1], mProbers[heb+2]);
|
||||
}
|
||||
else // One or more is null. avoid any Hebrew probing, null them all
|
||||
{
|
||||
for (PRUint32 j = heb + 2; j >= heb; --j)
|
||||
{
|
||||
delete mProbers[j];
|
||||
mProbers[j] = nsnull;
|
||||
}
|
||||
}
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252SpanishModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1SpanishModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15SpanishModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252PortugueseModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1PortugueseModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9PortugueseModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15PortugueseModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1250HungarianModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_2HungarianModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_3EsperantoModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252AfricaansModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1AfricaansModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9AfricaansModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15AfricaansModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1GermanModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252GermanModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1FrenchModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15FrenchModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252FrenchModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1SpanishModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15SpanishModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252SpanishModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1PortugueseModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9PortugueseModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15PortugueseModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252PortugueseModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_2HungarianModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1250HungarianModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_3EsperantoModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_3TurkishModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9TurkishModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252NederlandsModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1NederlandsModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9NederlandsModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15NederlandsModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252DanishModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15DanishModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1DanishModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252DanishModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_13LithuanianModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_10LithuanianModel);
|
||||
@ -145,18 +114,18 @@ nsSBCSGroupProber::nsSBCSGroupProber()
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&MaccentraleuropePolishModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Ibm852PolishModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252FinnishModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1FinnishModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_4FinnishModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9FinnishModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_13FinnishModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15FinnishModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252FinnishModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252ItalianModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1ItalianModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_3ItalianModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9ItalianModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15ItalianModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252ItalianModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1250CroatianModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_2CroatianModel);
|
||||
@ -171,10 +140,10 @@ nsSBCSGroupProber::nsSBCSGroupProber()
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_13EstonianModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15EstonianModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252IrishModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1IrishModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9IrishModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15IrishModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252IrishModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1250RomanianModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_2RomanianModel);
|
||||
@ -187,25 +156,55 @@ nsSBCSGroupProber::nsSBCSGroupProber()
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&MaccentraleuropeSloveneModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Ibm852SloveneModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252SwedishModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1SwedishModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_4SwedishModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9SwedishModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15SwedishModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252SwedishModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1251BelarusianModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Win1251RussianModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Koi8rRussianModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Latin5RussianModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&MacCyrillicRussianModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Ibm866RussianModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Ibm855RussianModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_7GreekModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1253GreekModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Latin5BulgarianModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Win1251BulgarianModel);
|
||||
|
||||
nsHebrewProber* hebprober = new nsHebrewProber();
|
||||
// Notice: Any change in these indexes - 10,11,12 must be reflected
|
||||
// in the code below as well.
|
||||
PRUint32 const heb = i;
|
||||
mProbers[i++] = hebprober;
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Win1255Model, PR_FALSE, hebprober); // Logical Hebrew
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Win1255Model, PR_TRUE, hebprober); // Visual Hebrew
|
||||
// Tell the Hebrew prober about the logical and visual probers
|
||||
if (mProbers[heb] && mProbers[heb + 1] && mProbers[heb + 2]) // all are not null
|
||||
{
|
||||
hebprober->SetModelProbers(mProbers[heb + 1], mProbers[heb + 2]);
|
||||
}
|
||||
else // One or more is null. avoid any Hebrew probing, null them all
|
||||
{
|
||||
for (PRUint32 j = heb + 2; j >= heb; --j)
|
||||
{
|
||||
delete mProbers[j];
|
||||
mProbers[j] = nsnull;
|
||||
}
|
||||
}
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_3TurkishModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9TurkishModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252NederlandsModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1NederlandsModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9NederlandsModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15NederlandsModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_6ArabicModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1256ArabicModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_6ArabicModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&VisciiVietnameseModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1258VietnameseModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&VisciiVietnameseModel);
|
||||
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Tis_620ThaiModel);
|
||||
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_11ThaiModel);
|
||||
|
||||
@ -83,7 +83,7 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32
|
||||
if (mTotalSeqs > SB_ENOUGH_REL_THRESHOLD)
|
||||
{
|
||||
float const cf = GetConfidence();
|
||||
if (cf > POSITIVE_SHORTCUT_THRESHOLD)
|
||||
if (cf >= POSITIVE_SHORTCUT_THRESHOLD)
|
||||
mState = eFoundIt;
|
||||
else if (cf < NEGATIVE_SHORTCUT_THRESHOLD)
|
||||
mState = eNotMe;
|
||||
@ -116,14 +116,13 @@ float nsSingleByteCharSetProber::GetConfidence()
|
||||
if ((mTotalChar > 0) && (mTotalSeqs > 0))
|
||||
{
|
||||
// weighted good sequence count
|
||||
PRUint32 const validSeqs = mSeqCounters[POSITIVE_CAT]
|
||||
+ (mSeqCounters[PROBABLE_CAT] >> 1)
|
||||
+ (mSeqCounters[NEUTRAL_CAT] >> 2);
|
||||
PRUint32 const probableSeqs = mSeqCounters[POSITIVE_CAT] + (mSeqCounters[PROBABLE_CAT] >> 2);
|
||||
PRUint32 const validSeqs = mTotalSeqs - mSeqCounters[NEGATIVE_CAT];
|
||||
|
||||
float r = mModel->mTypicalPositiveRatio;
|
||||
float r = rfactor(mSeqCounters[POSITIVE_CAT], mTotalSeqs) / mModel->mTypicalPositiveRatio;
|
||||
|
||||
// negative sequence correction factor
|
||||
r *= rfactor(validSeqs, (mTotalSeqs + ((validSeqs >> 2) * mSeqCounters[NEGATIVE_CAT])));
|
||||
r *= rfactor(validSeqs, (mTotalSeqs + (netChars * mSeqCounters[NEGATIVE_CAT])));
|
||||
|
||||
/* Multiply by a ratio of positive sequences per characters.
|
||||
* This would help in particular to distinguish close winners.
|
||||
@ -133,9 +132,9 @@ float nsSingleByteCharSetProber::GetConfidence()
|
||||
* character). This could make the difference between very closely related
|
||||
* charsets used for the same language.
|
||||
*/
|
||||
r *= rfactor(validSeqs, netChars);
|
||||
r *= rfactor(validSeqs, netChars);
|
||||
|
||||
/* The more control characters (proportionnaly to the size of the text), the
|
||||
/* The more control characters (proportionally to the size of the text), the
|
||||
* less confident we become in the current charset.
|
||||
*/
|
||||
r *= rfactor(netChars, mTotalChar);
|
||||
|
||||
@ -55,7 +55,7 @@
|
||||
/* Numbers 0-9. */
|
||||
#define NUM 251
|
||||
|
||||
#define SB_ENOUGH_REL_THRESHOLD min(512, ENOUGH_DATA_THRESHOLD)
|
||||
#define SB_ENOUGH_REL_THRESHOLD (ENOUGH_DATA_THRESHOLD >> 1)
|
||||
#define POSITIVE_SHORTCUT_THRESHOLD SHORTCUT_THRESHOLD
|
||||
#define NEGATIVE_SHORTCUT_THRESHOLD (0.05f)
|
||||
#define SYMBOL_CAT_ORDER 250
|
||||
@ -83,6 +83,7 @@ class nsSingleByteCharSetProber : public nsCharSetProber{
|
||||
public:
|
||||
nsSingleByteCharSetProber(const SequenceModel *model)
|
||||
:mModel(model), mReversed(PR_FALSE), mNameProber(0) { Reset(); }
|
||||
|
||||
nsSingleByteCharSetProber(const SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber)
|
||||
:mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); }
|
||||
|
||||
|
||||
@ -55,7 +55,6 @@ nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
|
||||
{
|
||||
mNbspFound = PR_FALSE;
|
||||
mDone = PR_FALSE;
|
||||
mBestGuess = -1; //illegal value as signal
|
||||
mInTag = PR_FALSE;
|
||||
mEscCharSetProber = nsnull;
|
||||
|
||||
@ -86,7 +85,6 @@ nsUniversalDetector::Reset()
|
||||
{
|
||||
mNbspFound = PR_FALSE;
|
||||
mDone = PR_FALSE;
|
||||
mBestGuess = -1; //illegal value as signal
|
||||
mInTag = PR_FALSE;
|
||||
|
||||
mStart = PR_TRUE;
|
||||
@ -365,15 +363,14 @@ void nsUniversalDetector::DataEnd()
|
||||
{
|
||||
case eHighbyte:
|
||||
{
|
||||
float proberConfidence;
|
||||
float maxProberConfidence = (float)0.0;
|
||||
PRInt32 maxProber = 0;
|
||||
float maxProberConfidence = 0.0f;
|
||||
PRInt32 maxProber = -1;
|
||||
|
||||
for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
|
||||
for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; ++i)
|
||||
{
|
||||
if (mCharSetProbers[i])
|
||||
{
|
||||
proberConfidence = mCharSetProbers[i]->GetConfidence();
|
||||
float const proberConfidence = mCharSetProbers[i]->GetConfidence();
|
||||
if (proberConfidence > maxProberConfidence)
|
||||
{
|
||||
maxProberConfidence = proberConfidence;
|
||||
@ -382,10 +379,10 @@ void nsUniversalDetector::DataEnd()
|
||||
}
|
||||
}
|
||||
mDetectedConfidence = maxProberConfidence;
|
||||
|
||||
//do not report anything because we are not confident of it, that's in fact a negative answer
|
||||
if (maxProberConfidence > MINIMUM_THRESHOLD) {
|
||||
Report(mCharSetProbers[maxProber]->GetCharSetName(), mCharSetProbers[maxProber]->GetConfidence());
|
||||
mDetectedConfidence = mCharSetProbers[maxProber]->GetConfidence();
|
||||
if ((maxProber >= 0) && (maxProberConfidence > MINIMUM_THRESHOLD)) {
|
||||
Report(mCharSetProbers[maxProber]->GetCharSetName(), maxProberConfidence);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
@ -86,7 +86,6 @@ protected:
|
||||
const char * mDetectedCharset;
|
||||
float mDetectedConfidence;
|
||||
short mDetectedIsBOM;
|
||||
PRInt32 mBestGuess;
|
||||
PRUint32 mLanguageFilter;
|
||||
|
||||
nsCharSetProber *mCharSetProbers[NUM_OF_CHARSET_PROBERS];
|
||||
|
||||
Loading…
Reference in New Issue
Block a user