+ chg: fine-tuning of Single Byte Character Set (SBCS) detection

This commit is contained in:
RaiKoHoff 2020-01-28 18:42:01 +01:00
parent ac1ba2496f
commit 3cefe783cb
6 changed files with 83 additions and 88 deletions

View File

@ -1279,7 +1279,7 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(LPWSTR pszFile, const char* lpData,
if (encDetRes.analyzedEncoding == CPI_NONE)
{
encDetRes.analyzedEncoding = iAnalyzeFallback;
confidence = Settings2.AnalyzeReliableConfidenceLevel / 4.0f;
confidence = (1.0f - Settings2.AnalyzeReliableConfidenceLevel);
}
if (!bSkipUTFDetection)
@ -1322,7 +1322,7 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(LPWSTR pszFile, const char* lpData,
if (encDetRes.analyzedEncoding == CPI_NONE)
{
encDetRes.analyzedEncoding = iAnalyzeFallback;
confidence = Settings2.AnalyzeReliableConfidenceLevel / 4.0f;
confidence = (1.0f - Settings2.AnalyzeReliableConfidenceLevel);
}
else if (encDetRes.analyzedEncoding == CPI_ASCII_7BIT) {
encDetRes.analyzedEncoding = asciiEnc;

View File

@ -51,72 +51,41 @@ nsSBCSGroupProber::nsSBCSGroupProber()
: mNumOfProbers(MAX_NUM_OF_SBCS_PROBERS), mBestGuess(-1), mActiveNum(0)
{
PRUint32 i = 0;
mProbers[i++] = new nsSingleByteCharSetProber(&Win1251RussianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Koi8rRussianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Latin5RussianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&MacCyrillicRussianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Ibm866RussianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Ibm855RussianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_7GreekModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1253GreekModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252GermanModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1GermanModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Latin5BulgarianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Win1251BulgarianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252FrenchModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1FrenchModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15FrenchModel);
nsHebrewProber *hebprober = new nsHebrewProber();
// Notice: the Hebrew probers occupy three consecutive slots starting at
// index `heb` (heb, heb+1, heb+2); the code below relies on that layout.
PRUint32 const heb = i;
mProbers[i++] = hebprober;
mProbers[i++] = new nsSingleByteCharSetProber(&Win1255Model, PR_FALSE, hebprober); // Logical Hebrew
mProbers[i++] = new nsSingleByteCharSetProber(&Win1255Model, PR_TRUE, hebprober); // Visual Hebrew
// Tell the Hebrew prober about the logical and visual probers
if (mProbers[heb] && mProbers[heb+1] && mProbers[heb+2]) // all are not null
{
hebprober->SetModelProbers(mProbers[heb+1], mProbers[heb+2]);
}
else // One or more is null. avoid any Hebrew probing, null them all
{
for (PRUint32 j = heb + 2; j >= heb; --j)
{
delete mProbers[j];
mProbers[j] = nsnull;
}
}
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252SpanishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1SpanishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15SpanishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252PortugueseModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1PortugueseModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9PortugueseModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15PortugueseModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1250HungarianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_2HungarianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_3EsperantoModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252AfricaansModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1AfricaansModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9AfricaansModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15AfricaansModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1GermanModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252GermanModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1FrenchModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15FrenchModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252FrenchModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1SpanishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15SpanishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252SpanishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1PortugueseModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9PortugueseModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15PortugueseModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252PortugueseModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_2HungarianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1250HungarianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_3EsperantoModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_3TurkishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9TurkishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252NederlandsModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1NederlandsModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9NederlandsModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15NederlandsModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252DanishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15DanishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1DanishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252DanishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_13LithuanianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_10LithuanianModel);
@ -145,18 +114,18 @@ nsSBCSGroupProber::nsSBCSGroupProber()
mProbers[i++] = new nsSingleByteCharSetProber(&MaccentraleuropePolishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Ibm852PolishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252FinnishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1FinnishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_4FinnishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9FinnishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_13FinnishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15FinnishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252FinnishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252ItalianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1ItalianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_3ItalianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9ItalianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15ItalianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252ItalianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1250CroatianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_2CroatianModel);
@ -171,10 +140,10 @@ nsSBCSGroupProber::nsSBCSGroupProber()
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_13EstonianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15EstonianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252IrishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1IrishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9IrishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15IrishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252IrishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1250RomanianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_2RomanianModel);
@ -187,25 +156,55 @@ nsSBCSGroupProber::nsSBCSGroupProber()
mProbers[i++] = new nsSingleByteCharSetProber(&MaccentraleuropeSloveneModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Ibm852SloveneModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252SwedishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1SwedishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_4SwedishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9SwedishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15SwedishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252SwedishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1251BelarusianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Win1251RussianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Koi8rRussianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Latin5RussianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&MacCyrillicRussianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Ibm866RussianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Ibm855RussianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_7GreekModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1253GreekModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Latin5BulgarianModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Win1251BulgarianModel);
nsHebrewProber* hebprober = new nsHebrewProber();
// Notice: the Hebrew probers occupy three consecutive slots starting at
// index `heb` (heb, heb+1, heb+2); the code below relies on that layout.
PRUint32 const heb = i;
mProbers[i++] = hebprober;
mProbers[i++] = new nsSingleByteCharSetProber(&Win1255Model, PR_FALSE, hebprober); // Logical Hebrew
mProbers[i++] = new nsSingleByteCharSetProber(&Win1255Model, PR_TRUE, hebprober); // Visual Hebrew
// Tell the Hebrew prober about the logical and visual probers
if (mProbers[heb] && mProbers[heb + 1] && mProbers[heb + 2]) // all are not null
{
hebprober->SetModelProbers(mProbers[heb + 1], mProbers[heb + 2]);
}
else // One or more is null. avoid any Hebrew probing, null them all
{
for (PRUint32 j = heb + 2; j >= heb; --j)
{
delete mProbers[j];
mProbers[j] = nsnull;
}
}
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_3TurkishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9TurkishModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1252NederlandsModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_1NederlandsModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_9NederlandsModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_15NederlandsModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_6ArabicModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1256ArabicModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_6ArabicModel);
mProbers[i++] = new nsSingleByteCharSetProber(&VisciiVietnameseModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Windows_1258VietnameseModel);
mProbers[i++] = new nsSingleByteCharSetProber(&VisciiVietnameseModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Tis_620ThaiModel);
mProbers[i++] = new nsSingleByteCharSetProber(&Iso_8859_11ThaiModel);

View File

@ -83,7 +83,7 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32
if (mTotalSeqs > SB_ENOUGH_REL_THRESHOLD)
{
float const cf = GetConfidence();
if (cf > POSITIVE_SHORTCUT_THRESHOLD)
if (cf >= POSITIVE_SHORTCUT_THRESHOLD)
mState = eFoundIt;
else if (cf < NEGATIVE_SHORTCUT_THRESHOLD)
mState = eNotMe;
@ -116,14 +116,13 @@ float nsSingleByteCharSetProber::GetConfidence()
if ((mTotalChar > 0) && (mTotalSeqs > 0))
{
// weighted good sequence count
PRUint32 const validSeqs = mSeqCounters[POSITIVE_CAT]
+ (mSeqCounters[PROBABLE_CAT] >> 1)
+ (mSeqCounters[NEUTRAL_CAT] >> 2);
PRUint32 const probableSeqs = mSeqCounters[POSITIVE_CAT] + (mSeqCounters[PROBABLE_CAT] >> 2);
PRUint32 const validSeqs = mTotalSeqs - mSeqCounters[NEGATIVE_CAT];
float r = mModel->mTypicalPositiveRatio;
float r = rfactor(mSeqCounters[POSITIVE_CAT], mTotalSeqs) / mModel->mTypicalPositiveRatio;
// negative sequence correction factor
r *= rfactor(validSeqs, (mTotalSeqs + ((validSeqs >> 2) * mSeqCounters[NEGATIVE_CAT])));
r *= rfactor(validSeqs, (mTotalSeqs + (netChars * mSeqCounters[NEGATIVE_CAT])));
/* Multiply by a ratio of positive sequences per characters.
* This would help in particular to distinguish close winners.
@ -133,9 +132,9 @@ float nsSingleByteCharSetProber::GetConfidence()
* character). This could make the difference between very closely related
* charsets used for the same language.
*/
r *= rfactor(validSeqs, netChars);
r *= rfactor(validSeqs, netChars);
/* The more control characters (proportionnaly to the size of the text), the
/* The more control characters (proportionally to the size of the text), the
* less confident we become in the current charset.
*/
r *= rfactor(netChars, mTotalChar);

View File

@ -55,7 +55,7 @@
/* Numbers 0-9. */
#define NUM 251
#define SB_ENOUGH_REL_THRESHOLD min(512, ENOUGH_DATA_THRESHOLD)
#define SB_ENOUGH_REL_THRESHOLD (ENOUGH_DATA_THRESHOLD >> 1)
#define POSITIVE_SHORTCUT_THRESHOLD SHORTCUT_THRESHOLD
#define NEGATIVE_SHORTCUT_THRESHOLD (0.05f)
#define SYMBOL_CAT_ORDER 250
@ -83,6 +83,7 @@ class nsSingleByteCharSetProber : public nsCharSetProber{
public:
nsSingleByteCharSetProber(const SequenceModel *model)
:mModel(model), mReversed(PR_FALSE), mNameProber(0) { Reset(); }
nsSingleByteCharSetProber(const SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber)
:mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); }

View File

@ -55,7 +55,6 @@ nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
{
mNbspFound = PR_FALSE;
mDone = PR_FALSE;
mBestGuess = -1; //illegal value as signal
mInTag = PR_FALSE;
mEscCharSetProber = nsnull;
@ -86,7 +85,6 @@ nsUniversalDetector::Reset()
{
mNbspFound = PR_FALSE;
mDone = PR_FALSE;
mBestGuess = -1; //illegal value as signal
mInTag = PR_FALSE;
mStart = PR_TRUE;
@ -365,15 +363,14 @@ void nsUniversalDetector::DataEnd()
{
case eHighbyte:
{
float proberConfidence;
float maxProberConfidence = (float)0.0;
PRInt32 maxProber = 0;
float maxProberConfidence = 0.0f;
PRInt32 maxProber = -1;
for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; ++i)
{
if (mCharSetProbers[i])
{
proberConfidence = mCharSetProbers[i]->GetConfidence();
float const proberConfidence = mCharSetProbers[i]->GetConfidence();
if (proberConfidence > maxProberConfidence)
{
maxProberConfidence = proberConfidence;
@ -382,10 +379,10 @@ void nsUniversalDetector::DataEnd()
}
}
mDetectedConfidence = maxProberConfidence;
//do not report anything because we are not confident of it, that's in fact a negative answer
if (maxProberConfidence > MINIMUM_THRESHOLD) {
Report(mCharSetProbers[maxProber]->GetCharSetName(), mCharSetProbers[maxProber]->GetConfidence());
mDetectedConfidence = mCharSetProbers[maxProber]->GetConfidence();
if ((maxProber >= 0) && (maxProberConfidence > MINIMUM_THRESHOLD)) {
Report(mCharSetProbers[maxProber]->GetCharSetName(), maxProberConfidence);
}
}
break;

View File

@ -86,7 +86,6 @@ protected:
const char * mDetectedCharset;
float mDetectedConfidence;
short mDetectedIsBOM;
PRInt32 mBestGuess;
PRUint32 mLanguageFilter;
nsCharSetProber *mCharSetProbers[NUM_OF_CHARSET_PROBERS];