+ chg: Increase encoding detection confidence threshold to 92%

This commit is contained in:
Rainer Kottenhoff 2020-01-31 10:07:58 +01:00
parent 6c32c040b0
commit 8c75d9537e
7 changed files with 120 additions and 143 deletions

View File

@ -49,7 +49,7 @@ SettingsVersion=4
;UndoTransactionTimeout=0
;AdministrationTool.exe=
;DevDebugMode=0
;AnalyzeReliableConfidenceLevel=70
;AnalyzeReliableConfidenceLevel=92
;LexerSQLNumberSignAsComment=1
;ExitOnESCSkipLevel=2
[Statusbar Settings]

View File

@ -801,7 +801,7 @@ void LoadSettings()
Settings2.NoCutLineOnEmptySelection = IniSectionGetBool(IniSecSettings2, L"NoCutLineOnEmptySelection", Defaults2.NoCutLineOnEmptySelection);
int const iARCLdef = 70;
int const iARCLdef = 92;
Defaults2.AnalyzeReliableConfidenceLevel = (float)iARCLdef / 100.0f;
int const iARCLset = clampi(IniSectionGetInt(IniSecSettings2, L"AnalyzeReliableConfidenceLevel", iARCLdef), 0, 100);
Settings2.AnalyzeReliableConfidenceLevel = (float)iARCLset / 100.0f;

View File

@ -2581,133 +2581,112 @@ static INT_PTR CALLBACK SelectDefEncodingDlgProc(HWND hwnd, UINT umsg, WPARAM wP
switch (umsg)
{
case WM_INITDIALOG:
{
SetWindowLongPtr(hwnd, DWLP_USER, (LONG_PTR)lParam);
if (Globals.hDlgIcon) { SendMessage(hwnd, WM_SETICON, ICON_SMALL, (LPARAM)Globals.hDlgIcon); }
case WM_INITDIALOG:
{
SetWindowLongPtr(hwnd, DWLP_USER, (LONG_PTR)lParam);
if (Globals.hDlgIcon) { SendMessage(hwnd, WM_SETICON, ICON_SMALL, (LPARAM)Globals.hDlgIcon); }
PENCODEDLG const pdd = (PENCODEDLG)lParam;
HBITMAP hbmp = LoadImage(Globals.hInstance, MAKEINTRESOURCE(IDB_ENCODING), IMAGE_BITMAP, 0, 0, LR_CREATEDIBSECTION);
hbmp = ResizeImageForCurrentDPI(hbmp);
PENCODEDLG const pdd = (PENCODEDLG)lParam;
HBITMAP hbmp = LoadImage(Globals.hInstance, MAKEINTRESOURCE(IDB_ENCODING), IMAGE_BITMAP, 0, 0, LR_CREATEDIBSECTION);
hbmp = ResizeImageForCurrentDPI(hbmp);
HIMAGELIST himl = ImageList_Create(16, 16, ILC_COLOR32 | ILC_MASK, 0, 0);
ImageList_AddMasked(himl, hbmp, CLR_DEFAULT);
DeleteObject(hbmp);
SendDlgItemMessage(hwnd, IDC_ENCODINGLIST, CBEM_SETIMAGELIST, 0, (LPARAM)himl);
SendDlgItemMessage(hwnd, IDC_ENCODINGLIST, CB_SETEXTENDEDUI, true, 0);
HIMAGELIST himl = ImageList_Create(16, 16, ILC_COLOR32 | ILC_MASK, 0, 0);
ImageList_AddMasked(himl, hbmp, CLR_DEFAULT);
DeleteObject(hbmp);
SendDlgItemMessage(hwnd, IDC_ENCODINGLIST, CBEM_SETIMAGELIST, 0, (LPARAM)himl);
SendDlgItemMessage(hwnd, IDC_ENCODINGLIST, CB_SETEXTENDEDUI, true, 0);
Encoding_AddToComboboxEx(GetDlgItem(hwnd, IDC_ENCODINGLIST), pdd->idEncoding, 0);
Encoding_AddToComboboxEx(GetDlgItem(hwnd, IDC_ENCODINGLIST), pdd->idEncoding, 0);
Encoding_GetFromComboboxEx(GetDlgItem(hwnd, IDC_ENCODINGLIST), &s_iEnc);
s_bLoadASCIIasUTF8 = Settings.LoadASCIIasUTF8;
s_bUseAsFallback = Encoding_IsASCII(s_iEnc) ? Settings.UseDefaultForFileEncoding : false;
CheckDlgButton(hwnd, IDC_USEASREADINGFALLBACK, SetBtn(s_bUseAsFallback));
CheckDlgButton(hwnd, IDC_ASCIIASUTF8, SetBtn(s_bLoadASCIIasUTF8));
CheckDlgButton(hwnd, IDC_RELIABLE_DETECTION_RES, SetBtn(Settings.UseReliableCEDonly));
CheckDlgButton(hwnd, IDC_NFOASOEM, SetBtn(Settings.LoadNFOasOEM));
CheckDlgButton(hwnd, IDC_ENCODINGFROMFILEVARS, SetBtn(!Settings.NoEncodingTags));
CheckDlgButton(hwnd, IDC_NOUNICODEDETECTION, SetBtn(!Settings.SkipUnicodeDetection));
CheckDlgButton(hwnd, IDC_NOANSICPDETECTION, SetBtn(!Settings.SkipANSICodePageDetection));
DialogEnableControl(hwnd, IDC_USEASREADINGFALLBACK, Encoding_IsASCII(s_iEnc));
CenterDlgInParent(hwnd, NULL);
}
return true;
case WM_DPICHANGED:
UpdateWindowLayoutForDPI(hwnd, 0, 0, 0, 0);
return true;
case WM_COMMAND:
switch (LOWORD(wParam))
{
case IDC_ASCIIASUTF8:
if (s_iEnc != CPI_UTF8) {
s_bLoadASCIIasUTF8 = IsButtonChecked(hwnd, IDC_ASCIIASUTF8);
}
break;
case IDC_USEASREADINGFALLBACK:
if (s_iEnc != CPI_ANSI_DEFAULT) {
s_bUseAsFallback = IsButtonChecked(hwnd, IDC_USEASREADINGFALLBACK);
}
if (s_iEnc == CPI_UTF8) {
if (s_bUseAsFallback) {
CheckDlgButton(hwnd, IDC_ASCIIASUTF8, SetBtn(true));
DialogEnableControl(hwnd, IDC_ASCIIASUTF8, false);
}
else
{
DialogEnableControl(hwnd, IDC_ASCIIASUTF8, true);
CheckDlgButton(hwnd, IDC_ASCIIASUTF8, SetBtn(s_bLoadASCIIasUTF8));
}
}
break;
case IDC_ENCODINGLIST:
{
Encoding_GetFromComboboxEx(GetDlgItem(hwnd, IDC_ENCODINGLIST), &s_iEnc);
s_bUseAsFallback = Encoding_IsASCII(s_iEnc) ? Settings.UseDefaultForFileEncoding : false;
if (s_iEnc == CPI_UTF8) {
if (s_bUseAsFallback) {
DialogEnableControl(hwnd, IDC_ASCIIASUTF8, false);
CheckDlgButton(hwnd, IDC_ASCIIASUTF8, SetBtn(true));
}
DialogEnableControl(hwnd, IDC_USEASREADINGFALLBACK, Encoding_IsASCII(s_iEnc));
CheckDlgButton(hwnd, IDC_USEASREADINGFALLBACK, SetBtn(s_bUseAsFallback));
}
else if (s_iEnc == CPI_ANSI_DEFAULT) {
DialogEnableControl(hwnd, IDC_ASCIIASUTF8, true);
CheckDlgButton(hwnd, IDC_ASCIIASUTF8, SetBtn(s_bLoadASCIIasUTF8));
s_bUseAsFallback = true;
DialogEnableControl(hwnd, IDC_USEASREADINGFALLBACK, false);
CheckDlgButton(hwnd, IDC_USEASREADINGFALLBACK, SetBtn(s_bUseAsFallback));
}
else {
Encoding_GetFromComboboxEx(GetDlgItem(hwnd, IDC_ENCODINGLIST), &s_iEnc);
s_bLoadASCIIasUTF8 = Settings.LoadASCIIasUTF8;
s_bUseAsFallback = Encoding_IsASCII(s_iEnc) ? Settings.UseDefaultForFileEncoding : false;
DialogEnableControl(hwnd, IDC_ASCIIASUTF8, true);
CheckDlgButton(hwnd, IDC_ASCIIASUTF8, SetBtn(s_bLoadASCIIasUTF8));
DialogEnableControl(hwnd, IDC_USEASREADINGFALLBACK, Encoding_IsASCII(s_iEnc));
CheckDlgButton(hwnd, IDC_USEASREADINGFALLBACK, SetBtn(s_bUseAsFallback));
}
}
break;
case IDOK: {
PENCODEDLG pdd = (PENCODEDLG)GetWindowLongPtr(hwnd, DWLP_USER);
if (Encoding_GetFromComboboxEx(GetDlgItem(hwnd, IDC_ENCODINGLIST), &pdd->idEncoding)) {
if (pdd->idEncoding < 0) {
InfoBoxLng(MB_ICONWARNING, NULL, IDS_MUI_ERR_ENCODINGNA);
CheckDlgButton(hwnd, IDC_ASCIIASUTF8, SetBtn(s_bLoadASCIIasUTF8));
CheckDlgButton(hwnd, IDC_RELIABLE_DETECTION_RES, SetBtn(Settings.UseReliableCEDonly));
CheckDlgButton(hwnd, IDC_NFOASOEM, SetBtn(Settings.LoadNFOasOEM));
CheckDlgButton(hwnd, IDC_ENCODINGFROMFILEVARS, SetBtn(!Settings.NoEncodingTags));
CheckDlgButton(hwnd, IDC_NOUNICODEDETECTION, SetBtn(!Settings.SkipUnicodeDetection));
CheckDlgButton(hwnd, IDC_NOANSICPDETECTION, SetBtn(!Settings.SkipANSICodePageDetection));
CenterDlgInParent(hwnd, NULL);
}
return true;
case WM_DPICHANGED:
UpdateWindowLayoutForDPI(hwnd, 0, 0, 0, 0);
return true;
case WM_COMMAND:
switch (LOWORD(wParam))
{
case IDC_ENCODINGLIST:
case IDC_USEASREADINGFALLBACK:
case IDC_ASCIIASUTF8:
{
Encoding_GetFromComboboxEx(GetDlgItem(hwnd, IDC_ENCODINGLIST), &s_iEnc);
s_bUseAsFallback = Encoding_IsASCII(s_iEnc) ? IsButtonChecked(hwnd, IDC_USEASREADINGFALLBACK) : false;
s_bLoadASCIIasUTF8 = IsButtonChecked(hwnd, IDC_ASCIIASUTF8);
DialogEnableControl(hwnd, IDC_USEASREADINGFALLBACK, Encoding_IsASCII(s_iEnc));
CheckDlgButton(hwnd, IDC_USEASREADINGFALLBACK, SetBtn(s_bUseAsFallback));
DialogEnableControl(hwnd, IDC_ASCIIASUTF8, true);
CheckDlgButton(hwnd, IDC_ASCIIASUTF8, SetBtn(s_bLoadASCIIasUTF8));
if (s_iEnc == CPI_UTF8) {
if (s_bUseAsFallback) {
s_bLoadASCIIasUTF8 = true;
DialogEnableControl(hwnd, IDC_ASCIIASUTF8, false);
CheckDlgButton(hwnd, IDC_ASCIIASUTF8, SetBtn(s_bLoadASCIIasUTF8));
}
}
else if (s_iEnc == CPI_ANSI_DEFAULT) {
if (s_bUseAsFallback) {
s_bLoadASCIIasUTF8 = false;
DialogEnableControl(hwnd, IDC_ASCIIASUTF8, false);
CheckDlgButton(hwnd, IDC_ASCIIASUTF8, SetBtn(s_bLoadASCIIasUTF8));
}
}
}
break;
case IDOK: {
PENCODEDLG pdd = (PENCODEDLG)GetWindowLongPtr(hwnd, DWLP_USER);
if (Encoding_GetFromComboboxEx(GetDlgItem(hwnd, IDC_ENCODINGLIST), &pdd->idEncoding)) {
if (pdd->idEncoding < 0) {
InfoBoxLng(MB_ICONWARNING, NULL, IDS_MUI_ERR_ENCODINGNA);
EndDialog(hwnd, IDCANCEL);
}
else {
Settings.UseDefaultForFileEncoding = IsButtonChecked(hwnd, IDC_USEASREADINGFALLBACK);
Settings.LoadASCIIasUTF8 = IsButtonChecked(hwnd, IDC_ASCIIASUTF8);
Settings.UseReliableCEDonly = IsButtonChecked(hwnd, IDC_RELIABLE_DETECTION_RES);
Settings.LoadNFOasOEM = IsButtonChecked(hwnd, IDC_NFOASOEM);
Settings.NoEncodingTags = !IsButtonChecked(hwnd, IDC_ENCODINGFROMFILEVARS);
Settings.SkipUnicodeDetection = !IsButtonChecked(hwnd, IDC_NOUNICODEDETECTION);
Settings.SkipANSICodePageDetection = !IsButtonChecked(hwnd, IDC_NOANSICPDETECTION);
EndDialog(hwnd, IDOK);
}
}
else {
PostMessage(hwnd, WM_NEXTDLGCTL, (WPARAM)(GetDlgItem(hwnd, IDC_ENCODINGLIST)), 1);
}
}
break;
case IDCANCEL:
EndDialog(hwnd, IDCANCEL);
}
else {
Settings.UseDefaultForFileEncoding = IsButtonChecked(hwnd, IDC_USEASREADINGFALLBACK);
Settings.LoadASCIIasUTF8 = IsButtonChecked(hwnd, IDC_ASCIIASUTF8);
Settings.UseReliableCEDonly = IsButtonChecked(hwnd, IDC_RELIABLE_DETECTION_RES);
Settings.LoadNFOasOEM = IsButtonChecked(hwnd, IDC_NFOASOEM);
Settings.NoEncodingTags = !IsButtonChecked(hwnd, IDC_ENCODINGFROMFILEVARS);
Settings.SkipUnicodeDetection = !IsButtonChecked(hwnd, IDC_NOUNICODEDETECTION);
Settings.SkipANSICodePageDetection = !IsButtonChecked(hwnd, IDC_NOANSICPDETECTION);
EndDialog(hwnd, IDOK);
}
break;
}
else {
PostMessage(hwnd, WM_NEXTDLGCTL, (WPARAM)(GetDlgItem(hwnd, IDC_ENCODINGLIST)), 1);
}
}
break;
case IDCANCEL:
EndDialog(hwnd, IDCANCEL);
break;
}
return true;
return true;
}
return false;
}

View File

@ -979,6 +979,10 @@ bool EditLoadFile(
bool bClearUndoHistory,
EditFileIOStatus* status)
{
cpi_enc_t const iEncFallback = Settings.UseDefaultForFileEncoding ?
Settings.DefaultEncoding : (Settings.LoadASCIIasUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT);
status->iEncoding = iEncFallback;
status->bUnicodeErr = false;
status->bFileTooBig = false;
status->bUnknownExt = false;
@ -1062,8 +1066,8 @@ bool EditLoadFile(
if (cbData == 0) {
FileVars_Init(NULL, 0, &Globals.fvCurFile);
status->iEncoding = iEncFallback;
status->iEOLMode = Settings.DefaultEOLMode;
status->iEncoding = Settings.LoadASCIIasUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT;
EditSetNewText(hwnd, "", 0, bClearUndoHistory);
SciCall_SetEOLMode(Settings.DefaultEOLMode);
Encoding_Forced(CPI_NONE);
@ -1091,12 +1095,7 @@ bool EditLoadFile(
return false;
}
bool const bValidUTF8 = IsValidUTF8(lpData, cbData);
cpi_enc_t const iAnalyzeFallback = Settings.UseDefaultForFileEncoding ? Settings.DefaultEncoding :
((bValidUTF8 && Settings.LoadASCIIasUTF8) ? CPI_UTF8 : CPI_ANSI_DEFAULT);
ENC_DET_T encDetection = Encoding_DetectEncoding(pszFile, lpData, cbData, iAnalyzeFallback,
ENC_DET_T encDetection = Encoding_DetectEncoding(pszFile, lpData, cbData, iEncFallback,
bSkipUTFDetection, bSkipANSICPDetection, bForceEncDetection);
#define IS_ENC_ENFORCED() (!Encoding_IsNONE(encDetection.forcedEncoding))
@ -1168,6 +1167,7 @@ bool EditLoadFile(
else // === ALL OTHERS ===
{
// === UTF-8 ? ===
bool const bValidUTF8 = IsValidUTF8(lpData, cbData);
bool const bForcedUTF8 = Encoding_IsUTF8(encDetection.forcedEncoding);// ~ don't || encDetection.bIsUTF8Sig here !
bool const bAnalysisUTF8 = Encoding_IsUTF8(encDetection.Encoding);
@ -1191,7 +1191,7 @@ bool EditLoadFile(
{
// load UTF-7/ASCII(7-bit) as ANSI/UTF-8
EditSetNewText(hwnd, lpData, cbData, bClearUndoHistory);
status->iEncoding = Settings.LoadASCIIasUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT;
status->iEncoding = iEncFallback;
EditDetectEOLMode(lpData, cbData, status);
}
else { // === ALL OTHER NON UTF-8 ===

View File

@ -533,6 +533,11 @@ bool Encoding_IsUTF8_SIGN(const cpi_enc_t iEncoding) {
}
// ============================================================================
/* Reports whether iEncoding is accepted by Encoding_IsUTF8() while being
 * rejected by Encoding_IsUTF8_SIGN().
 * NOTE(review): "SIGN" presumably refers to the UTF-8 signature (BOM)
 * variant - confirm against Encoding_IsUTF8_SIGN(). */
bool Encoding_IsUTF8_NO_SIGN(const cpi_enc_t iEncoding) {
  if (!Encoding_IsUTF8(iEncoding)) {
    return false;  /* not UTF-8 at all; the signature check is not evaluated */
  }
  return !Encoding_IsUTF8_SIGN(iEncoding);
}
// ============================================================================
/* Reports whether the NCP_MBCS flag is set in the g_Encodings entry selected
 * by iEncoding. Negative ids yield false without touching the table.
 * NOTE(review): only the lower bound is checked here; the caller is assumed
 * to pass an id that is a valid index into g_Encodings - confirm. */
bool Encoding_IsMBCS(const cpi_enc_t iEncoding) {
  if (iEncoding < 0) {
    return false;
  }
  return (g_Encodings[iEncoding].uFlags & NCP_MBCS) != 0;
}

View File

@ -94,6 +94,7 @@ bool Encoding_IsANSI(const cpi_enc_t iEncoding);
bool Encoding_IsOEM(const cpi_enc_t iEncoding);
bool Encoding_IsUTF8(const cpi_enc_t iEncoding);
bool Encoding_IsUTF8_SIGN(const cpi_enc_t iEncoding);
bool Encoding_IsUTF8_NO_SIGN(const cpi_enc_t iEncoding);
bool Encoding_IsMBCS(const cpi_enc_t iEncoding);
bool Encoding_IsCJK(const cpi_enc_t iEncoding);
bool Encoding_IsUNICODE(const cpi_enc_t iEncoding);
@ -174,6 +175,7 @@ typedef struct _enc_det_t
cpi_enc_t fileVarEncoding;
cpi_enc_t analyzedEncoding;
cpi_enc_t unicodeAnalysis;
float confidence;
// flags:
bool bIsAnalysisReliable;
bool bHasBOM;

View File

@ -1265,7 +1265,7 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(LPWSTR pszFile, const char* lpData,
size_t const cbNbytes4Analysis = min_s(cbData, 200000LL);
float confidence = 0.0f;
encDetRes.confidence = 0.0f;
cpi_enc_t const asciiEnc = Settings.LoadASCIIasUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT;
@ -1273,13 +1273,16 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(LPWSTR pszFile, const char* lpData,
{
if (!bSkipANSICPDetection)
{
encDetRes.analyzedEncoding = Encoding_AnalyzeText(lpData, cbNbytes4Analysis, &confidence, iAnalyzeFallback);
encDetRes.analyzedEncoding = Encoding_AnalyzeText(lpData, cbNbytes4Analysis, &encDetRes.confidence, iAnalyzeFallback);
}
if (encDetRes.analyzedEncoding == CPI_NONE)
{
encDetRes.analyzedEncoding = iAnalyzeFallback;
confidence = (1.0f - Settings2.AnalyzeReliableConfidenceLevel);
encDetRes.confidence = (1.0f - Settings2.AnalyzeReliableConfidenceLevel);
}
else if (encDetRes.analyzedEncoding == CPI_ASCII_7BIT) {
encDetRes.analyzedEncoding = asciiEnc;
}
if (!bSkipUTFDetection)
@ -1317,19 +1320,7 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(LPWSTR pszFile, const char* lpData,
//bool const bIsUTF8orUnicodeAnalysis = Encoding_IsUTF8(encDetRes.analyzedEncoding) || Encoding_IsUNICODE(encDetRes.analyzedEncoding);
if (!IS_ENC_ENFORCED())
{
if (encDetRes.analyzedEncoding == CPI_NONE)
{
encDetRes.analyzedEncoding = iAnalyzeFallback;
confidence = (1.0f - Settings2.AnalyzeReliableConfidenceLevel);
}
else if (encDetRes.analyzedEncoding == CPI_ASCII_7BIT) {
encDetRes.analyzedEncoding = asciiEnc;
}
}
int const iConfidence = float2int(confidence * 100.0f);
int const iConfidence = float2int(encDetRes.confidence * 100.0f);
int const iReliableThreshold = float2int(Settings2.AnalyzeReliableConfidenceLevel * 100.0f);
encDetRes.bIsAnalysisReliable = (iConfidence >= iReliableThreshold);