From 88ef77c5ef3f15f204890a7107c91639de60b749 Mon Sep 17 00:00:00 2001 From: Rainer Kottenhoff Date: Wed, 28 Feb 2018 22:47:14 +0100 Subject: [PATCH 1/6] + refactoring: prepare encoding detector enhancement --- src/Edit.c | 106 +++++++++-------- src/Edit.h | 2 + src/Helpers.c | 307 ++++++++++++++++++++++++------------------------- src/Helpers.h | 14 +-- src/Notepad3.c | 123 ++++++++++---------- src/Styles.c | 8 +- 6 files changed, 281 insertions(+), 279 deletions(-) diff --git a/src/Edit.c b/src/Edit.c index b979bc2a8..21799bc8c 100644 --- a/src/Edit.c +++ b/src/Edit.c @@ -75,14 +75,14 @@ static int yFindReplaceDlgSave; extern int xFindReplaceDlg; extern int yFindReplaceDlg; -extern int iDefaultEOLMode; +extern int g_iDefaultEOLMode; extern int iLineEndings[3]; extern BOOL bFixLineEndings; extern BOOL bAutoStripBlanks; // Default Codepage and Character Set -extern int iDefaultEncoding; -extern int iDefaultCharSet; +extern int g_iDefaultEncoding; +extern int g_iDefaultCharSet; extern BOOL bLoadASCIIasUTF8; extern BOOL bLoadNFOasOEM; @@ -193,8 +193,8 @@ HWND EditCreate(HWND hwndParent) g_hInstance, NULL); - Encoding_Current(iDefaultEncoding); - Encoding_SciSetCodePage(hwnd,iDefaultEncoding); + Encoding_Current(g_iDefaultEncoding); + Encoding_SciSetCodePage(hwnd,g_iDefaultEncoding); SendMessage(hwnd,SCI_SETEOLMODE,SC_EOL_CRLF,0); SendMessage(hwnd,SCI_SETPASTECONVERTENDINGS,TRUE,0); SendMessage(hwnd,SCI_SETMODEVENTMASK,/*SC_MODEVENTMASKALL*/SC_MOD_INSERTTEXT|SC_MOD_DELETETEXT|SC_MOD_CONTAINER,0); @@ -939,7 +939,7 @@ BOOL EditCopyAppend(HWND hwnd, BOOL bAppend) // int EditDetectEOLMode(HWND hwnd,char* lpData,DWORD cbData) { - int iEOLMode = iLineEndings[iDefaultEOLMode]; + int iEOLMode = iLineEndings[g_iDefaultEOLMode]; char *cp = (char*)lpData; if (!cp) @@ -993,7 +993,7 @@ BOOL EditLoadFile( dwLastIOError = GetLastError(); if (hFile == INVALID_HANDLE_VALUE) { - Encoding_Source(CPI_NONE); + Encoding_SrcCmdLn(CPI_NONE); Encoding_SrcWeak(CPI_NONE); return FALSE; } @@ 
-1009,7 +1009,7 @@ BOOL EditLoadFile( CloseHandle(hFile); if (pbUnkownExt) *pbUnkownExt = TRUE; - Encoding_Source(CPI_NONE); + Encoding_SrcCmdLn(CPI_NONE); Encoding_SrcWeak(CPI_NONE); return FALSE; } @@ -1022,7 +1022,7 @@ BOOL EditLoadFile( CloseHandle(hFile); if (pbFileTooBig) *pbFileTooBig = TRUE; - Encoding_Source(CPI_NONE); + Encoding_SrcCmdLn(CPI_NONE); Encoding_SrcWeak(CPI_NONE); return FALSE; } @@ -1036,7 +1036,7 @@ BOOL EditLoadFile( CloseHandle(hFile); if (pbFileTooBig) *pbFileTooBig = FALSE; - Encoding_Source(CPI_NONE); + Encoding_SrcCmdLn(CPI_NONE); Encoding_SrcWeak(CPI_NONE); return FALSE; } @@ -1048,7 +1048,7 @@ BOOL EditLoadFile( if (!bReadSuccess) { GlobalFree(lpData); - Encoding_Source(CPI_NONE); + Encoding_SrcCmdLn(CPI_NONE); Encoding_SrcWeak(CPI_NONE); return FALSE; } @@ -1060,50 +1060,47 @@ BOOL EditLoadFile( bPreferOEM = TRUE; } - int _iPrefEncoding = (bPreferOEM) ? g_DOSEncoding : iDefaultEncoding; + const int iFileEncoding = Encoding_SrcCmdLn(CPI_GET); + const int iPreferedEncoding = (bPreferOEM) ? g_DOSEncoding : + (Encoding_IsValid(Encoding_SrcWeak(CPI_GET)) ? 
Encoding_SrcWeak(CPI_GET) : g_iDefaultEncoding); - if (Encoding_IsValid(Encoding_SrcWeak(CPI_GET))) { - _iPrefEncoding = Encoding_SrcWeak(CPI_GET); - } BOOL bBOM = FALSE; BOOL bReverse = FALSE; - const int iSrcEnc = Encoding_Source(CPI_GET); - if (cbData == 0) { FileVars_Init(NULL,0,&fvCurFile); - *iEOLMode = iLineEndings[iDefaultEOLMode]; - if (iSrcEnc == CPI_NONE) { + *iEOLMode = iLineEndings[g_iDefaultEOLMode]; + if (iFileEncoding == CPI_NONE) { if (bLoadASCIIasUTF8 && !bPreferOEM) *iEncoding = CPI_UTF8; else - *iEncoding = _iPrefEncoding; + *iEncoding = iPreferedEncoding; } else - *iEncoding = iSrcEnc; + *iEncoding = iFileEncoding; Encoding_SciSetCodePage(hwnd,*iEncoding); EditSetNewText(hwnd,"",0); - SendMessage(hwnd,SCI_SETEOLMODE,iLineEndings[iDefaultEOLMode],0); + SendMessage(hwnd,SCI_SETEOLMODE,iLineEndings[g_iDefaultEOLMode],0); GlobalFree(lpData); } else if (!bSkipEncodingDetection && - (iSrcEnc == CPI_NONE || iSrcEnc == CPI_UNICODE || iSrcEnc == CPI_UNICODEBE) && - (iSrcEnc == CPI_UNICODE || iSrcEnc == CPI_UNICODEBE || IsUnicode(lpData,cbData,&bBOM,&bReverse)) && - (iSrcEnc == CPI_UNICODE || iSrcEnc == CPI_UNICODEBE || !IsUTF8Signature(lpData))) // check for UTF-8 signature + (iFileEncoding == CPI_NONE || iFileEncoding == CPI_UNICODE || iFileEncoding == CPI_UNICODEBE) && + (iFileEncoding == CPI_UNICODE || iFileEncoding == CPI_UNICODEBE || IsUnicode(lpData,cbData,&bBOM,&bReverse)) && + (iFileEncoding == CPI_UNICODE || iFileEncoding == CPI_UNICODEBE || !IsUTF8Signature(lpData))) // check for UTF-8 signature { char* lpDataUTF8; - if (iSrcEnc == CPI_UNICODE) { + if (iFileEncoding == CPI_UNICODE) { bBOM = (*((UNALIGNED PWCHAR)lpData) == 0xFEFF); bReverse = FALSE; } - else if (iSrcEnc == CPI_UNICODEBE) + else if (iFileEncoding == CPI_UNICODEBE) bBOM = (*((UNALIGNED PWCHAR)lpData) == 0xFFFE); - if (iSrcEnc == CPI_UNICODEBE || bReverse) { + if (iFileEncoding == CPI_UNICODEBE || bReverse) { _swab(lpData,lpData,cbData); if (bBOM) *iEncoding = CPI_UNICODEBEBOM; @@ 
-1142,7 +1139,7 @@ BOOL EditLoadFile( else { GlobalFree(lpDataUTF8); GlobalFree(lpData); - Encoding_Source(CPI_NONE); + Encoding_SrcCmdLn(CPI_NONE); Encoding_SrcWeak(CPI_NONE); return FALSE; } @@ -1150,18 +1147,18 @@ BOOL EditLoadFile( else { FileVars_Init(lpData,cbData,&fvCurFile); - if (!bSkipEncodingDetection && (iSrcEnc == CPI_NONE || iSrcEnc == CPI_UTF8 || iSrcEnc == CPI_UTF8SIGN) && + if (!bSkipEncodingDetection && (iFileEncoding == CPI_NONE || iFileEncoding == CPI_UTF8 || iFileEncoding == CPI_UTF8SIGN) && ((IsUTF8Signature(lpData) || FileVars_IsUTF8(&fvCurFile) || - (iSrcEnc == CPI_UTF8 || iSrcEnc == CPI_UTF8SIGN) || + (iFileEncoding == CPI_UTF8 || iFileEncoding == CPI_UTF8SIGN) || (!bPreferOEM && bLoadASCIIasUTF8) || // from menu "Reload As UTF-8" (IsUTF8(lpData,cbData) && (((UTF8_mbslen_bytes(UTF8StringStart(lpData)) - 1 != UTF8_mbslen(UTF8StringStart(lpData),IsUTF8Signature(lpData) ? cbData-3 : cbData)) || (!bPreferOEM && ( - g_Encodings[_iPrefEncoding].uFlags & NCP_UTF8 || + g_Encodings[iPreferedEncoding].uFlags & NCP_UTF8 || bLoadASCIIasUTF8))))))) && !(FileVars_IsNonUTF8(&fvCurFile) && - (iSrcEnc != CPI_UTF8 && iSrcEnc != CPI_UTF8SIGN))) + (iFileEncoding != CPI_UTF8 && iFileEncoding != CPI_UTF8SIGN))) { Encoding_SciSetCodePage(hwnd,CPI_UTF8); EditSetNewText(hwnd,"",0); @@ -1179,25 +1176,29 @@ BOOL EditLoadFile( } else { - if (iSrcEnc != CPI_NONE) - *iEncoding = iSrcEnc; + if (iFileEncoding != CPI_NONE) + *iEncoding = iFileEncoding; else { *iEncoding = FileVars_GetEncoding(&fvCurFile); if (*iEncoding == CPI_NONE) { if (fvCurFile.mask & FV_ENCODING) *iEncoding = CPI_ANSI_DEFAULT; else { - if (Encoding_SrcWeak(CPI_GET) == CPI_NONE) - *iEncoding = _iPrefEncoding; - else if (g_Encodings[Encoding_SrcWeak(CPI_GET)].uFlags & NCP_INTERNAL) - *iEncoding = iDefaultEncoding; - else - *iEncoding = _iPrefEncoding; + int iEncWeak = Encoding_SrcWeak(CPI_GET); + switch (iEncWeak) { + case CPI_NONE: + *iEncoding = iPreferedEncoding; + break; + default: + *iEncoding = 
(g_Encodings[iEncWeak].uFlags & NCP_INTERNAL) ? g_iDefaultEncoding : + (Encoding_IsValid(iEncWeak) ? iEncWeak : iPreferedEncoding); + break; + } } } } - if (((g_Encodings[*iEncoding].uCodePage != CP_UTF7) && (g_Encodings[*iEncoding].uFlags & NCP_8BIT)) || + if (((g_Encodings[*iEncoding].uCodePage != CP_UTF7) && (g_Encodings[*iEncoding].uFlags & NCP_EXTERNAL_8BIT)) || ((g_Encodings[*iEncoding].uCodePage == CP_UTF7) && IsUTF7(lpData,cbData))) { UINT uCodePage = g_Encodings[*iEncoding].uCodePage; @@ -1221,7 +1222,7 @@ BOOL EditLoadFile( else { GlobalFree(lpDataWide); GlobalFree(lpData); - Encoding_Source(CPI_NONE); + Encoding_SrcCmdLn(CPI_NONE); Encoding_SrcWeak(CPI_NONE); return FALSE; } @@ -1229,13 +1230,13 @@ BOOL EditLoadFile( else { GlobalFree(lpDataWide); GlobalFree(lpData); - Encoding_Source(CPI_NONE); + Encoding_SrcCmdLn(CPI_NONE); Encoding_SrcWeak(CPI_NONE); return FALSE; } } else { - *iEncoding = Encoding_IsValid(iSrcEnc) ? iSrcEnc : iDefaultEncoding; + *iEncoding = Encoding_IsValid(iFileEncoding) ? 
iFileEncoding : g_iDefaultEncoding; Encoding_SciSetCodePage(hwnd,*iEncoding); EditSetNewText(hwnd,"",0); EditSetNewText(hwnd,lpData,cbData); @@ -1245,7 +1246,7 @@ BOOL EditLoadFile( } } - Encoding_Source(CPI_NONE); + Encoding_SrcCmdLn(CPI_NONE); Encoding_SrcWeak(CPI_NONE); return TRUE; @@ -1389,7 +1390,7 @@ BOOL EditSaveFile( GlobalFree(lpData); } - else if (g_Encodings[iEncoding].uFlags & NCP_8BIT) { + else if (g_Encodings[iEncoding].uFlags & NCP_EXTERNAL_8BIT) { BOOL bCancelDataLoss = FALSE; UINT uCodePage = g_Encodings[iEncoding].uCodePage; @@ -1793,7 +1794,7 @@ void EditEscapeCChars(HWND hwnd) { return; } - EDITFINDREPLACE efr = { "", "", "", "", 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL }; + EDITFINDREPLACE efr = EFR_INIT_DATA; efr.hwnd = hwnd; StringCchCopyA(efr.szFind,FNDRPL_BUFFER,"\\"); @@ -1825,7 +1826,7 @@ void EditUnescapeCChars(HWND hwnd) { return; } - EDITFINDREPLACE efr = { "", "", "", "", 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL }; + EDITFINDREPLACE efr = EFR_INIT_DATA; efr.hwnd = hwnd; StringCchCopyA(efr.szFind,FNDRPL_BUFFER,"\\\\"); @@ -7069,7 +7070,7 @@ void EditSetBookmarkList(HWND hwnd, LPCWSTR pszBookMarks) //============================================================================= // -// FileVars_Init() +// SetFileVars() // extern BOOL bNoEncodingTags; extern int flagNoFileVariables; @@ -7134,6 +7135,11 @@ void __fastcall SetFileVars(char* lpData, char* tch, LPFILEVARS lpfv) } } +//============================================================================= +// +// FileVars_Init() +// + BOOL FileVars_Init(char *lpData, DWORD cbData, LPFILEVARS lpfv) { char tch[LARGE_BUFFER]; diff --git a/src/Edit.h b/src/Edit.h index 837148f39..654a5b40d 100644 --- a/src/Edit.h +++ b/src/Edit.h @@ -42,6 +42,8 @@ typedef struct _editfindreplace } EDITFINDREPLACE, *LPEDITFINDREPLACE, *LPCEDITFINDREPLACE; +#define EFR_INIT_DATA { "", "", "", "", 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL } + #define IDMSG_SWITCHTOFIND 300 #define IDMSG_SWITCHTOREPLACE 301 diff --git a/src/Helpers.c 
b/src/Helpers.c index 9c766c7d6..e8d4271e9 100644 --- a/src/Helpers.c +++ b/src/Helpers.c @@ -2768,144 +2768,144 @@ WCHAR wchANSI[16] = { L'\0' }; WCHAR wchOEM[16] = { L'\0' }; NP2ENCODING g_Encodings[] = { - { NCP_ANSI | NCP_RECODE, CP_ACP, "ansi,system,ascii,", 61000, L"" }, - { NCP_OEM | NCP_RECODE, CP_OEMCP, "oem,oem,", 61001, L"" }, - { NCP_UNICODE | NCP_UNICODE_BOM, CP_UTF8, "", 61002, L"" }, - { NCP_UNICODE | NCP_UNICODE_REVERSE | NCP_UNICODE_BOM, CP_UTF8, "", 61003, L"" }, - { NCP_UNICODE | NCP_RECODE, CP_UTF8, "utf-16,utf16,unicode,", 61004, L"" }, - { NCP_UNICODE | NCP_UNICODE_REVERSE | NCP_RECODE, CP_UTF8, "utf-16be,utf16be,unicodebe,", 61005, L"" }, - { NCP_UTF8 | NCP_RECODE, CP_UTF8, "utf-8,utf8,", 61006, L"" }, - { NCP_UTF8 | NCP_UTF8_SIGN, CP_UTF8, "utf-8,utf8,", 61007, L"" }, - { NCP_8BIT | NCP_RECODE, CP_UTF7, "utf-7,utf7,", 61008, L"" }, - { NCP_8BIT | NCP_RECODE, 720, "DOS-720,dos720,", 61009, L"" }, - { NCP_8BIT | NCP_RECODE, 28596, "iso-8859-6,iso88596,arabic,csisolatinarabic,ecma114,isoir127,", 61010, L"" }, - { NCP_8BIT | NCP_RECODE, 10004, "x-mac-arabic,xmacarabic,", 61011, L"" }, - { NCP_8BIT | NCP_RECODE, 1256, "windows-1256,windows1256,cp1256", 61012, L"" }, - { NCP_8BIT | NCP_RECODE, 775, "ibm775,ibm775,cp500,", 61013, L"" }, - { NCP_8BIT | NCP_RECODE, 28594, "iso-8859-4,iso88594,csisolatin4,isoir110,l4,latin4,", 61014, L"" }, - { NCP_8BIT | NCP_RECODE, 1257, "windows-1257,windows1257,", 61015, L"" }, - { NCP_8BIT | NCP_RECODE, 852, "ibm852,ibm852,cp852,", 61016, L"" }, - { NCP_8BIT | NCP_RECODE, 28592, "iso-8859-2,iso88592,csisolatin2,isoir101,latin2,l2,", 61017, L"" }, - { NCP_8BIT | NCP_RECODE, 10029, "x-mac-ce,xmacce,", 61018, L"" }, - { NCP_8BIT | NCP_RECODE, 1250, "windows-1250,windows1250,xcp1250,", 61019, L"" }, - { NCP_8BIT | NCP_RECODE, 936, "gb2312,gb2312,chinese,cngb,csgb2312,csgb231280,gb231280,gbk,", 61020, L"" }, - { NCP_8BIT | NCP_RECODE, 10008, "x-mac-chinesesimp,xmacchinesesimp,", 61021, L"" }, - { NCP_8BIT | 
NCP_RECODE, 950, "big5,big5,cnbig5,csbig5,xxbig5,", 61022, L"" }, - { NCP_8BIT | NCP_RECODE, 10002, "x-mac-chinesetrad,xmacchinesetrad,", 61023, L"" }, - { NCP_8BIT | NCP_RECODE, 10082, "x-mac-croatian,xmaccroatian,", 61024, L"" }, - { NCP_8BIT | NCP_RECODE, 866, "cp866,cp866,ibm866,", 61025, L"" }, - { NCP_8BIT | NCP_RECODE, 28595, "iso-8859-5,iso88595,csisolatin5,csisolatincyrillic,cyrillic,isoir144,", 61026, L"" }, - { NCP_8BIT | NCP_RECODE, 20866, "koi8-r,koi8r,cskoi8r,koi,koi8,", 61027, L"" }, - { NCP_8BIT | NCP_RECODE, 21866, "koi8-u,koi8u,koi8ru,", 61028, L"" }, - { NCP_8BIT | NCP_RECODE, 10007, "x-mac-cyrillic,xmaccyrillic,", 61029, L"" }, - { NCP_8BIT | NCP_RECODE, 1251, "windows-1251,windows1251,xcp1251,", 61030, L"" }, - { NCP_8BIT | NCP_RECODE, 28603, "iso-8859-13,iso885913,", 61031, L"" }, - { NCP_8BIT | NCP_RECODE, 863, "ibm863,ibm863,", 61032, L"" }, - { NCP_8BIT | NCP_RECODE, 737, "ibm737,ibm737,", 61033, L"" }, - { NCP_8BIT | NCP_RECODE, 28597, "iso-8859-7,iso88597,csisolatingreek,ecma118,elot928,greek,greek8,isoir126,", 61034, L"" }, - { NCP_8BIT | NCP_RECODE, 10006, "x-mac-greek,xmacgreek,", 61035, L"" }, - { NCP_8BIT | NCP_RECODE, 1253, "windows-1253,windows1253,", 61036, L"" }, - { NCP_8BIT | NCP_RECODE, 869, "ibm869,ibm869,", 61037, L"" }, - { NCP_8BIT | NCP_RECODE, 862, "DOS-862,dos862,", 61038, L"" }, - { NCP_8BIT | NCP_RECODE, 38598, "iso-8859-8-i,iso88598i,logical,", 61039, L"" }, - { NCP_8BIT | NCP_RECODE, 28598, "iso-8859-8,iso88598,csisolatinhebrew,hebrew,isoir138,visual,", 61040, L"" }, - { NCP_8BIT | NCP_RECODE, 10005, "x-mac-hebrew,xmachebrew,", 61041, L"" }, - { NCP_8BIT | NCP_RECODE, 1255, "windows-1255,windows1255,", 61042, L"" }, - { NCP_8BIT | NCP_RECODE, 861, "ibm861,ibm861,", 61043, L"" }, - { NCP_8BIT | NCP_RECODE, 10079, "x-mac-icelandic,xmacicelandic,", 61044, L"" }, - { NCP_8BIT | NCP_RECODE, 10001, "x-mac-japanese,xmacjapanese,", 61045, L"" }, - { NCP_8BIT | NCP_RECODE, 932, 
"shift_jis,shiftjis,shiftjs,csshiftjis,cswindows31j,mskanji,xmscp932,xsjis,", 61046, L"" }, - { NCP_8BIT | NCP_RECODE, 10003, "x-mac-korean,xmackorean,", 61047, L"" }, - { NCP_8BIT | NCP_RECODE, 949, "windows-949,windows949,ksc56011987,csksc5601,euckr,isoir149,korean,ksc56011989", 61048, L"" }, - { NCP_8BIT | NCP_RECODE, 28593, "iso-8859-3,iso88593,latin3,isoir109,l3,", 61049, L"" }, - { NCP_8BIT | NCP_RECODE, 28605, "iso-8859-15,iso885915,latin9,l9,", 61050, L"" }, - { NCP_8BIT | NCP_RECODE, 865, "ibm865,ibm865,", 61051, L"" }, - { NCP_8BIT | NCP_RECODE, 437, "ibm437,ibm437,437,cp437,cspc8,codepage437,", 61052, L"" }, - { NCP_8BIT | NCP_RECODE, 858, "ibm858,ibm858,ibm00858,", 61053, L"" }, - { NCP_8BIT | NCP_RECODE, 860, "ibm860,ibm860,", 61054, L"" }, - { NCP_8BIT | NCP_RECODE, 10010, "x-mac-romanian,xmacromanian,", 61055, L"" }, - { NCP_8BIT | NCP_RECODE, 10021, "x-mac-thai,xmacthai,", 61056, L"" }, - { NCP_8BIT | NCP_RECODE, 874, "windows-874,windows874,dos874,iso885911,tis620,", 61057, L"" }, - { NCP_8BIT | NCP_RECODE, 857, "ibm857,ibm857,", 61058, L"" }, - { NCP_8BIT | NCP_RECODE, 28599, "iso-8859-9,iso88599,latin5,isoir148,l5,", 61059, L"" }, - { NCP_8BIT | NCP_RECODE, 10081, "x-mac-turkish,xmacturkish,", 61060, L"" }, - { NCP_8BIT | NCP_RECODE, 1254, "windows-1254,windows1254,", 61061, L"" }, - { NCP_8BIT | NCP_RECODE, 10017, "x-mac-ukrainian,xmacukrainian,", 61062, L"" }, - { NCP_8BIT | NCP_RECODE, 1258, "windows-1258,windows-258,", 61063, L"" }, - { NCP_8BIT | NCP_RECODE, 850, "ibm850,ibm850,", 61064, L"" }, - { NCP_8BIT | NCP_RECODE, 28591, "iso-8859-1,iso88591,cp819,latin1,ibm819,isoir100,latin1,l1,", 61065, L"" }, - { NCP_8BIT | NCP_RECODE, 10000, "macintosh,macintosh,", 61066, L"" }, - { NCP_8BIT | NCP_RECODE, 1252, "windows-1252,windows1252,cp367,cp819,ibm367,us,xansi,", 61067, L"" }, - { NCP_8BIT | NCP_RECODE, 37, "ebcdic-cp-us,ebcdiccpus,ebcdiccpca,ebcdiccpwt,ebcdiccpnl,ibm037,cp037,", 61068, L"" }, - { NCP_8BIT | NCP_RECODE, 500, 
"x-ebcdic-international,xebcdicinternational,", 61069, L"" }, - { NCP_8BIT | NCP_RECODE, 875, "x-EBCDIC-GreekModern,xebcdicgreekmodern,", 61070, L"" }, - { NCP_8BIT | NCP_RECODE, 1026, "CP1026,cp1026,csibm1026,ibm1026,", 61071, L"" }, - //{ NCP_8BIT|NCP_RECODE, 870, "CP870,cp870,ebcdiccproece,ebcdiccpyu,csibm870,ibm870,", 00000, L"" }, // IBM EBCDIC (Multilingual Latin-2) - //{ NCP_8BIT|NCP_RECODE, 1047, "IBM01047,ibm01047,", 00000, L"" }, // IBM EBCDIC (Open System Latin-1) - //{ NCP_8BIT|NCP_RECODE, 1140, "x-ebcdic-cp-us-euro,xebcdiccpuseuro,", 00000, L"" }, // IBM EBCDIC (US-Canada-Euro) - //{ NCP_8BIT|NCP_RECODE, 1141, "x-ebcdic-germany-euro,xebcdicgermanyeuro,", 00000, L"" }, // IBM EBCDIC (Germany-Euro) - //{ NCP_8BIT|NCP_RECODE, 1142, "x-ebcdic-denmarknorway-euro,xebcdicdenmarknorwayeuro,", 00000, L"" }, // IBM EBCDIC (Denmark-Norway-Euro) - //{ NCP_8BIT|NCP_RECODE, 1143, "x-ebcdic-finlandsweden-euro,xebcdicfinlandswedeneuro,", 00000, L"" }, // IBM EBCDIC (Finland-Sweden-Euro) - //{ NCP_8BIT|NCP_RECODE, 1144, "x-ebcdic-italy-euro,xebcdicitalyeuro,", 00000, L"" }, // IBM EBCDIC (Italy-Euro) - //{ NCP_8BIT|NCP_RECODE, 1145, "x-ebcdic-spain-euro,xebcdicspaineuro,", 00000, L"" }, // IBM EBCDIC (Spain-Latin America-Euro) - //{ NCP_8BIT|NCP_RECODE, 1146, "x-ebcdic-uk-euro,xebcdicukeuro,", 00000, L"" }, // IBM EBCDIC (UK-Euro) - //{ NCP_8BIT|NCP_RECODE, 1147, "x-ebcdic-france-euro,xebcdicfranceeuro,", 00000, L"" }, // IBM EBCDIC (France-Euro) - //{ NCP_8BIT|NCP_RECODE, 1148, "x-ebcdic-international-euro,xebcdicinternationaleuro,", 00000, L"" }, // IBM EBCDIC (International-Euro) - //{ NCP_8BIT|NCP_RECODE, 1149, "x-ebcdic-icelandic-euro,xebcdicicelandiceuro,", 00000, L"" }, // IBM EBCDIC (Icelandic-Euro) - //{ NCP_8BIT|NCP_RECODE, 1361, "johab,johab,", 00000, L"" }, // Korean (Johab) - //{ NCP_8BIT|NCP_RECODE, 20273, "x-EBCDIC-Germany,xebcdicgermany,", 00000, L"" }, // IBM EBCDIC (Germany) - //{ NCP_8BIT|NCP_RECODE, 20277, 
"x-EBCDIC-DenmarkNorway,xebcdicdenmarknorway,ebcdiccpdk,ebcdiccpno,", 00000, L"" }, // IBM EBCDIC (Denmark-Norway) - //{ NCP_8BIT|NCP_RECODE, 20278, "x-EBCDIC-FinlandSweden,xebcdicfinlandsweden,ebcdicpfi,ebcdiccpse,", 00000, L"" }, // IBM EBCDIC (Finland-Sweden) - //{ NCP_8BIT|NCP_RECODE, 20280, "x-EBCDIC-Italy,xebcdicitaly,", 00000, L"" }, // IBM EBCDIC (Italy) - //{ NCP_8BIT|NCP_RECODE, 20284, "x-EBCDIC-Spain,xebcdicspain,ebcdiccpes,", 00000, L"" }, // IBM EBCDIC (Spain-Latin America) - //{ NCP_8BIT|NCP_RECODE, 20285, "x-EBCDIC-UK,xebcdicuk,ebcdiccpgb,", 00000, L"" }, // IBM EBCDIC (UK) - //{ NCP_8BIT|NCP_RECODE, 20290, "x-EBCDIC-JapaneseKatakana,xebcdicjapanesekatakana,", 00000, L"" }, // IBM EBCDIC (Japanese Katakana) - //{ NCP_8BIT|NCP_RECODE, 20297, "x-EBCDIC-France,xebcdicfrance,ebcdiccpfr,", 00000, L"" }, // IBM EBCDIC (France) - //{ NCP_8BIT|NCP_RECODE, 20420, "x-EBCDIC-Arabic,xebcdicarabic,ebcdiccpar1,", 00000, L"" }, // IBM EBCDIC (Arabic) - //{ NCP_8BIT|NCP_RECODE, 20423, "x-EBCDIC-Greek,xebcdicgreek,ebcdiccpgr,", 00000, L"" }, // IBM EBCDIC (Greek) - //{ NCP_8BIT|NCP_RECODE, 20424, "x-EBCDIC-Hebrew,xebcdichebrew,ebcdiccphe,", 00000, L"" }, // IBM EBCDIC (Hebrew) - //{ NCP_8BIT|NCP_RECODE, 20833, "x-EBCDIC-KoreanExtended,xebcdickoreanextended,", 00000, L"" }, // IBM EBCDIC (Korean Extended) - //{ NCP_8BIT|NCP_RECODE, 20838, "x-EBCDIC-Thai,xebcdicthai,ibmthai,csibmthai,", 00000, L"" }, // IBM EBCDIC (Thai) - //{ NCP_8BIT|NCP_RECODE, 20871, "x-EBCDIC-Icelandic,xebcdicicelandic,ebcdiccpis,", 00000, L"" }, // IBM EBCDIC (Icelandic) - //{ NCP_8BIT|NCP_RECODE, 20880, "x-EBCDIC-CyrillicRussian,xebcdiccyrillicrussian,ebcdiccyrillic,", 00000, L"" }, // IBM EBCDIC (Cyrillic Russian) - //{ NCP_8BIT|NCP_RECODE, 20905, "x-EBCDIC-Turkish,xebcdicturkish,ebcdiccptr,", 00000, L"" }, // IBM EBCDIC (Turkish) - //{ NCP_8BIT|NCP_RECODE, 20924, "IBM00924,ibm00924,ebcdiclatin9euro,", 00000, L"" }, // IBM EBCDIC (Open System-Euro Latin-1) - //{ NCP_8BIT|NCP_RECODE, 21025, 
"x-EBCDIC-CyrillicSerbianBulgarian,xebcdiccyrillicserbianbulgarian,", 00000, L"" }, // IBM EBCDIC (Cyrillic Serbian-Bulgarian) - //{ NCP_8BIT|NCP_RECODE, 50930, "x-EBCDIC-JapaneseAndKana,xebcdicjapaneseandkana,", 00000, L"" }, // IBM EBCDIC (Japanese and Japanese Katakana) - //{ NCP_8BIT|NCP_RECODE, 50931, "x-EBCDIC-JapaneseAndUSCanada,xebcdicjapaneseanduscanada,", 00000, L"" }, // IBM EBCDIC (Japanese and US-Canada) - //{ NCP_8BIT|NCP_RECODE, 50933, "x-EBCDIC-KoreanAndKoreanExtended,xebcdickoreanandkoreanextended,", 00000, L"" }, // IBM EBCDIC (Korean and Korean Extended) - //{ NCP_8BIT|NCP_RECODE, 50935, "x-EBCDIC-SimplifiedChinese,xebcdicsimplifiedchinese,", 00000, L"" }, // IBM EBCDIC (Chinese Simplified) - //{ NCP_8BIT|NCP_RECODE, 50937, "x-EBCDIC-TraditionalChinese,xebcdictraditionalchinese,", 00000, L"" }, // IBM EBCDIC (Chinese Traditional) - //{ NCP_8BIT|NCP_RECODE, 50939, "x-EBCDIC-JapaneseAndJapaneseLatin,xebcdicjapaneseandjapaneselatin,", 00000, L"" }, // IBM EBCDIC (Japanese and Japanese-Latin) - //{ NCP_8BIT|NCP_RECODE, 20105, "x-IA5,xia5,", 00000, L"" }, // Western European (IA5) - //{ NCP_8BIT|NCP_RECODE, 20106, "x-IA5-German,xia5german,", 00000, L"" }, // German (IA5) - //{ NCP_8BIT|NCP_RECODE, 20107, "x-IA5-Swedish,xia5swedish,", 00000, L"" }, // Swedish (IA5) - //{ NCP_8BIT|NCP_RECODE, 20108, "x-IA5-Norwegian,xia5norwegian,", 00000, L"" }, // Norwegian (IA5) - //{ NCP_8BIT|NCP_RECODE, 20936, "x-cp20936,xcp20936,", 00000, L"" }, // Chinese Simplified (GB2312) - //{ NCP_8BIT|NCP_RECODE, 20932, "euc-jp,,", 00000, L"" }, // Japanese (JIS X 0208-1990 & 0212-1990) - //{ NCP_8BIT|NCP_RECODE, 50220, "iso-2022-jp,iso2022jp,", 00000, L"" }, // Japanese (JIS) - //{ NCP_8BIT|NCP_RECODE, 50221, "csISO2022JP,csiso2022jp,", 00000, L"" }, // Japanese (JIS-Allow 1 byte Kana) - //{ NCP_8BIT|NCP_RECODE, 50222, "_iso-2022-jp$SIO,iso2022jpSIO,", 00000, L"" }, // Japanese (JIS-Allow 1 byte Kana - SO/SI) - //{ NCP_8BIT|NCP_RECODE, 50225, 
"iso-2022-kr,iso2022kr,csiso2022kr,", 00000, L"" }, // Korean (ISO-2022-KR) - //{ NCP_8BIT|NCP_RECODE, 50227, "x-cp50227,xcp50227,", 00000, L"" }, // Chinese Simplified (ISO-2022) - //{ NCP_8BIT|NCP_RECODE, 50229, "iso-2022-cn,iso2022cn,", 00000, L"" }, // Chinese Traditional (ISO-2022) - //{ NCP_8BIT|NCP_RECODE, 20000, "x-Chinese-CNS,xchinesecns,", 00000, L"" }, // Chinese Traditional (CNS) - //{ NCP_8BIT|NCP_RECODE, 20002, "x-Chinese-Eten,xchineseeten,", 00000, L"" }, // Chinese Traditional (Eten) - //{ NCP_8BIT|NCP_RECODE, 51932, "euc-jp,eucjp,xeuc,xeucjp,", 00000, L"" }, // Japanese (EUC) - //{ NCP_8BIT|NCP_RECODE, 51936, "euc-cn,euccn,xeuccn,", 00000, L"" }, // Chinese Simplified (EUC) - //{ NCP_8BIT|NCP_RECODE, 51949, "euc-kr,euckr,cseuckr,", 00000, L"" }, // Korean (EUC) - //{ NCP_8BIT|NCP_RECODE, 52936, "hz-gb-2312,hzgb2312,hz,", 00000, L"" }, // Chinese Simplified (HZ-GB2312) - { NCP_8BIT | NCP_RECODE, 54936, "gb18030,gb18030,", 61072, L"" } // Chinese Simplified (GB18030) - //{ NCP_8BIT|NCP_RECODE, 57002, "x-iscii-de,xisciide,", 00000, L"" }, // ISCII Devanagari - //{ NCP_8BIT|NCP_RECODE, 57003, "x-iscii-be,xisciibe,", 00000, L"" }, // ISCII Bengali - //{ NCP_8BIT|NCP_RECODE, 57004, "x-iscii-ta,xisciita,", 00000, L"" }, // ISCII Tamil - //{ NCP_8BIT|NCP_RECODE, 57005, "x-iscii-te,xisciite,", 00000, L"" }, // ISCII Telugu - //{ NCP_8BIT|NCP_RECODE, 57006, "x-iscii-as,xisciias,", 00000, L"" }, // ISCII Assamese - //{ NCP_8BIT|NCP_RECODE, 57007, "x-iscii-or,xisciior,", 00000, L"" }, // ISCII Oriya - //{ NCP_8BIT|NCP_RECODE, 57008, "x-iscii-ka,xisciika,", 00000, L"" }, // ISCII Kannada - //{ NCP_8BIT|NCP_RECODE, 57009, "x-iscii-ma,xisciima,", 00000, L"" }, // ISCII Malayalam - //{ NCP_8BIT|NCP_RECODE, 57010, "x-iscii-gu,xisciigu,", 00000, L"" }, // ISCII Gujarathi - //{ NCP_8BIT|NCP_RECODE, 57011, "x-iscii-pa,xisciipa,", 00000, L"" }, // ISCII Panjabi + { NCP_ANSI | NCP_RECODE, CP_ACP, "ansi,system,ascii,", 61000, L"" }, + { NCP_OEM | NCP_RECODE, CP_OEMCP, 
"oem,oem,", 61001, L"" }, + { NCP_UNICODE | NCP_UNICODE_BOM, CP_UTF8, "", 61002, L"" }, + { NCP_UNICODE | NCP_UNICODE_REVERSE | NCP_UNICODE_BOM, CP_UTF8, "", 61003, L"" }, + { NCP_UNICODE | NCP_RECODE, CP_UTF8, "utf-16,utf16,unicode,", 61004, L"" }, + { NCP_UNICODE | NCP_UNICODE_REVERSE | NCP_RECODE, CP_UTF8, "utf-16be,utf16be,unicodebe,", 61005, L"" }, + { NCP_UTF8 | NCP_RECODE, CP_UTF8, "utf-8,utf8,", 61006, L"" }, + { NCP_UTF8 | NCP_UTF8_SIGN, CP_UTF8, "utf-8,utf8,", 61007, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, CP_UTF7, "utf-7,utf7,", 61008, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 720, "DOS-720,dos720,", 61009, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 28596, "iso-8859-6,iso88596,arabic,csisolatinarabic,ecma114,isoir127,", 61010, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 10004, "x-mac-arabic,xmacarabic,", 61011, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 1256, "windows-1256,windows1256,cp1256", 61012, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 775, "ibm775,ibm775,cp500,", 61013, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 28594, "iso-8859-4,iso88594,csisolatin4,isoir110,l4,latin4,", 61014, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 1257, "windows-1257,windows1257,", 61015, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 852, "ibm852,ibm852,cp852,", 61016, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 28592, "iso-8859-2,iso88592,csisolatin2,isoir101,latin2,l2,", 61017, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 10029, "x-mac-ce,xmacce,", 61018, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 1250, "windows-1250,windows1250,xcp1250,", 61019, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 936, "gb2312,gb2312,chinese,cngb,csgb2312,csgb231280,gb231280,gbk,", 61020, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 10008, "x-mac-chinesesimp,xmacchinesesimp,", 61021, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 950, "big5,big5,cnbig5,csbig5,xxbig5,", 61022, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 10002, "x-mac-chinesetrad,xmacchinesetrad,", 61023, L"" }, + { NCP_EXTERNAL_8BIT | 
NCP_RECODE, 10082, "x-mac-croatian,xmaccroatian,", 61024, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 866, "cp866,cp866,ibm866,", 61025, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 28595, "iso-8859-5,iso88595,csisolatin5,csisolatincyrillic,cyrillic,isoir144,", 61026, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 20866, "koi8-r,koi8r,cskoi8r,koi,koi8,", 61027, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 21866, "koi8-u,koi8u,koi8ru,", 61028, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 10007, "x-mac-cyrillic,xmaccyrillic,", 61029, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 1251, "windows-1251,windows1251,xcp1251,", 61030, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 28603, "iso-8859-13,iso885913,", 61031, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 863, "ibm863,ibm863,", 61032, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 737, "ibm737,ibm737,", 61033, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 28597, "iso-8859-7,iso88597,csisolatingreek,ecma118,elot928,greek,greek8,isoir126,", 61034, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 10006, "x-mac-greek,xmacgreek,", 61035, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 1253, "windows-1253,windows1253,", 61036, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 869, "ibm869,ibm869,", 61037, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 862, "DOS-862,dos862,", 61038, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 38598, "iso-8859-8-i,iso88598i,logical,", 61039, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 28598, "iso-8859-8,iso88598,csisolatinhebrew,hebrew,isoir138,visual,", 61040, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 10005, "x-mac-hebrew,xmachebrew,", 61041, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 1255, "windows-1255,windows1255,", 61042, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 861, "ibm861,ibm861,", 61043, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 10079, "x-mac-icelandic,xmacicelandic,", 61044, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 10001, "x-mac-japanese,xmacjapanese,", 61045, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 932, 
"shift_jis,shiftjis,shiftjs,csshiftjis,cswindows31j,mskanji,xmscp932,xsjis,", 61046, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 10003, "x-mac-korean,xmackorean,", 61047, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 949, "windows-949,windows949,ksc56011987,csksc5601,euckr,isoir149,korean,ksc56011989", 61048, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 28593, "iso-8859-3,iso88593,latin3,isoir109,l3,", 61049, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 28605, "iso-8859-15,iso885915,latin9,l9,", 61050, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 865, "ibm865,ibm865,", 61051, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 437, "ibm437,ibm437,437,cp437,cspc8,codepage437,", 61052, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 858, "ibm858,ibm858,ibm00858,", 61053, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 860, "ibm860,ibm860,", 61054, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 10010, "x-mac-romanian,xmacromanian,", 61055, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 10021, "x-mac-thai,xmacthai,", 61056, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 874, "windows-874,windows874,dos874,iso885911,tis620,", 61057, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 857, "ibm857,ibm857,", 61058, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 28599, "iso-8859-9,iso88599,latin5,isoir148,l5,", 61059, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 10081, "x-mac-turkish,xmacturkish,", 61060, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 1254, "windows-1254,windows1254,", 61061, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 10017, "x-mac-ukrainian,xmacukrainian,", 61062, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 1258, "windows-1258,windows-258,", 61063, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 850, "ibm850,ibm850,", 61064, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 28591, "iso-8859-1,iso88591,cp819,latin1,ibm819,isoir100,latin1,l1,", 61065, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 10000, "macintosh,macintosh,", 61066, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 1252, 
"windows-1252,windows1252,cp367,cp819,ibm367,us,xansi,", 61067, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 37, "ebcdic-cp-us,ebcdiccpus,ebcdiccpca,ebcdiccpwt,ebcdiccpnl,ibm037,cp037,", 61068, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 500, "x-ebcdic-international,xebcdicinternational,", 61069, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 875, "x-EBCDIC-GreekModern,xebcdicgreekmodern,", 61070, L"" }, + { NCP_EXTERNAL_8BIT | NCP_RECODE, 1026, "CP1026,cp1026,csibm1026,ibm1026,", 61071, L"" }, + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 870, "CP870,cp870,ebcdiccproece,ebcdiccpyu,csibm870,ibm870,", 00000, L"" }, // IBM EBCDIC (Multilingual Latin-2) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1047, "IBM01047,ibm01047,", 00000, L"" }, // IBM EBCDIC (Open System Latin-1) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1140, "x-ebcdic-cp-us-euro,xebcdiccpuseuro,", 00000, L"" }, // IBM EBCDIC (US-Canada-Euro) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1141, "x-ebcdic-germany-euro,xebcdicgermanyeuro,", 00000, L"" }, // IBM EBCDIC (Germany-Euro) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1142, "x-ebcdic-denmarknorway-euro,xebcdicdenmarknorwayeuro,", 00000, L"" }, // IBM EBCDIC (Denmark-Norway-Euro) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1143, "x-ebcdic-finlandsweden-euro,xebcdicfinlandswedeneuro,", 00000, L"" }, // IBM EBCDIC (Finland-Sweden-Euro) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1144, "x-ebcdic-italy-euro,xebcdicitalyeuro,", 00000, L"" }, // IBM EBCDIC (Italy-Euro) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1145, "x-ebcdic-spain-euro,xebcdicspaineuro,", 00000, L"" }, // IBM EBCDIC (Spain-Latin America-Euro) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1146, "x-ebcdic-uk-euro,xebcdicukeuro,", 00000, L"" }, // IBM EBCDIC (UK-Euro) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1147, "x-ebcdic-france-euro,xebcdicfranceeuro,", 00000, L"" }, // IBM EBCDIC (France-Euro) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1148, "x-ebcdic-international-euro,xebcdicinternationaleuro,", 00000, L"" }, // IBM EBCDIC (International-Euro) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 
1149, "x-ebcdic-icelandic-euro,xebcdicicelandiceuro,", 00000, L"" }, // IBM EBCDIC (Icelandic-Euro) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1361, "johab,johab,", 00000, L"" }, // Korean (Johab) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20273, "x-EBCDIC-Germany,xebcdicgermany,", 00000, L"" }, // IBM EBCDIC (Germany) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20277, "x-EBCDIC-DenmarkNorway,xebcdicdenmarknorway,ebcdiccpdk,ebcdiccpno,", 00000, L"" }, // IBM EBCDIC (Denmark-Norway) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20278, "x-EBCDIC-FinlandSweden,xebcdicfinlandsweden,ebcdicpfi,ebcdiccpse,", 00000, L"" }, // IBM EBCDIC (Finland-Sweden) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20280, "x-EBCDIC-Italy,xebcdicitaly,", 00000, L"" }, // IBM EBCDIC (Italy) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20284, "x-EBCDIC-Spain,xebcdicspain,ebcdiccpes,", 00000, L"" }, // IBM EBCDIC (Spain-Latin America) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20285, "x-EBCDIC-UK,xebcdicuk,ebcdiccpgb,", 00000, L"" }, // IBM EBCDIC (UK) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20290, "x-EBCDIC-JapaneseKatakana,xebcdicjapanesekatakana,", 00000, L"" }, // IBM EBCDIC (Japanese Katakana) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20297, "x-EBCDIC-France,xebcdicfrance,ebcdiccpfr,", 00000, L"" }, // IBM EBCDIC (France) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20420, "x-EBCDIC-Arabic,xebcdicarabic,ebcdiccpar1,", 00000, L"" }, // IBM EBCDIC (Arabic) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20423, "x-EBCDIC-Greek,xebcdicgreek,ebcdiccpgr,", 00000, L"" }, // IBM EBCDIC (Greek) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20424, "x-EBCDIC-Hebrew,xebcdichebrew,ebcdiccphe,", 00000, L"" }, // IBM EBCDIC (Hebrew) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20833, "x-EBCDIC-KoreanExtended,xebcdickoreanextended,", 00000, L"" }, // IBM EBCDIC (Korean Extended) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20838, "x-EBCDIC-Thai,xebcdicthai,ibmthai,csibmthai,", 00000, L"" }, // IBM EBCDIC (Thai) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20871, "x-EBCDIC-Icelandic,xebcdicicelandic,ebcdiccpis,", 00000, L"" }, 
// IBM EBCDIC (Icelandic) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20880, "x-EBCDIC-CyrillicRussian,xebcdiccyrillicrussian,ebcdiccyrillic,", 00000, L"" }, // IBM EBCDIC (Cyrillic Russian) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20905, "x-EBCDIC-Turkish,xebcdicturkish,ebcdiccptr,", 00000, L"" }, // IBM EBCDIC (Turkish) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20924, "IBM00924,ibm00924,ebcdiclatin9euro,", 00000, L"" }, // IBM EBCDIC (Open System-Euro Latin-1) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 21025, "x-EBCDIC-CyrillicSerbianBulgarian,xebcdiccyrillicserbianbulgarian,", 00000, L"" }, // IBM EBCDIC (Cyrillic Serbian-Bulgarian) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50930, "x-EBCDIC-JapaneseAndKana,xebcdicjapaneseandkana,", 00000, L"" }, // IBM EBCDIC (Japanese and Japanese Katakana) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50931, "x-EBCDIC-JapaneseAndUSCanada,xebcdicjapaneseanduscanada,", 00000, L"" }, // IBM EBCDIC (Japanese and US-Canada) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50933, "x-EBCDIC-KoreanAndKoreanExtended,xebcdickoreanandkoreanextended,", 00000, L"" }, // IBM EBCDIC (Korean and Korean Extended) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50935, "x-EBCDIC-SimplifiedChinese,xebcdicsimplifiedchinese,", 00000, L"" }, // IBM EBCDIC (Chinese Simplified) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50937, "x-EBCDIC-TraditionalChinese,xebcdictraditionalchinese,", 00000, L"" }, // IBM EBCDIC (Chinese Traditional) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50939, "x-EBCDIC-JapaneseAndJapaneseLatin,xebcdicjapaneseandjapaneselatin,", 00000, L"" }, // IBM EBCDIC (Japanese and Japanese-Latin) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20105, "x-IA5,xia5,", 00000, L"" }, // Western European (IA5) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20106, "x-IA5-German,xia5german,", 00000, L"" }, // German (IA5) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20107, "x-IA5-Swedish,xia5swedish,", 00000, L"" }, // Swedish (IA5) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20108, "x-IA5-Norwegian,xia5norwegian,", 00000, L"" }, // Norwegian (IA5) + //{ 
NCP_EXTERNAL_8BIT|NCP_RECODE, 20936, "x-cp20936,xcp20936,", 00000, L"" }, // Chinese Simplified (GB2312) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20932, "euc-jp,,", 00000, L"" }, // Japanese (JIS X 0208-1990 & 0212-1990) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50220, "iso-2022-jp,iso2022jp,", 00000, L"" }, // Japanese (JIS) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50221, "csISO2022JP,csiso2022jp,", 00000, L"" }, // Japanese (JIS-Allow 1 byte Kana) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50222, "_iso-2022-jp$SIO,iso2022jpSIO,", 00000, L"" }, // Japanese (JIS-Allow 1 byte Kana - SO/SI) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50225, "iso-2022-kr,iso2022kr,csiso2022kr,", 00000, L"" }, // Korean (ISO-2022-KR) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50227, "x-cp50227,xcp50227,", 00000, L"" }, // Chinese Simplified (ISO-2022) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50229, "iso-2022-cn,iso2022cn,", 00000, L"" }, // Chinese Traditional (ISO-2022) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20000, "x-Chinese-CNS,xchinesecns,", 00000, L"" }, // Chinese Traditional (CNS) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20002, "x-Chinese-Eten,xchineseeten,", 00000, L"" }, // Chinese Traditional (Eten) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 51932, "euc-jp,eucjp,xeuc,xeucjp,", 00000, L"" }, // Japanese (EUC) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 51936, "euc-cn,euccn,xeuccn,", 00000, L"" }, // Chinese Simplified (EUC) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 51949, "euc-kr,euckr,cseuckr,", 00000, L"" }, // Korean (EUC) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 52936, "hz-gb-2312,hzgb2312,hz,", 00000, L"" }, // Chinese Simplified (HZ-GB2312) + { NCP_EXTERNAL_8BIT | NCP_RECODE, 54936, "gb18030,gb18030,", 61072, L"" } // Chinese Simplified (GB18030) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57002, "x-iscii-de,xisciide,", 00000, L"" }, // ISCII Devanagari + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57003, "x-iscii-be,xisciibe,", 00000, L"" }, // ISCII Bengali + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57004, "x-iscii-ta,xisciita,", 00000, L"" }, // ISCII Tamil + //{ 
NCP_EXTERNAL_8BIT|NCP_RECODE, 57005, "x-iscii-te,xisciite,", 00000, L"" }, // ISCII Telugu + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57006, "x-iscii-as,xisciias,", 00000, L"" }, // ISCII Assamese + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57007, "x-iscii-or,xisciior,", 00000, L"" }, // ISCII Oriya + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57008, "x-iscii-ka,xisciika,", 00000, L"" }, // ISCII Kannada + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57009, "x-iscii-ma,xisciima,", 00000, L"" }, // ISCII Malayalam + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57010, "x-iscii-gu,xisciigu,", 00000, L"" }, // ISCII Gujarathi + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57011, "x-iscii-pa,xisciipa,", 00000, L"" }, // ISCII Panjabi }; int Encoding_CountOf() { @@ -2925,7 +2925,7 @@ int Encoding_Current(int iEncoding) { } -int Encoding_Source(int iSrcEncoding) { +int Encoding_SrcCmdLn(int iSrcEncoding) { static int SourceEncoding = CPI_NONE; if (iSrcEncoding >= 0) { @@ -2983,8 +2983,8 @@ void Encoding_InitDefaults() { for (int i = CPI_UTF7 + 1; i < COUNTOF(g_Encodings); ++i) { if (Encoding_IsValid(i) && (g_Encodings[i].uCodePage == g_Encodings[CPI_ANSI_DEFAULT].uCodePage)) { g_Encodings[i].uFlags |= NCP_ANSI; - if (g_Encodings[i].uFlags & NCP_8BIT) - g_Encodings[CPI_ANSI_DEFAULT].uFlags |= NCP_8BIT; + if (g_Encodings[i].uFlags & NCP_EXTERNAL_8BIT) + g_Encodings[CPI_ANSI_DEFAULT].uFlags |= NCP_EXTERNAL_8BIT; break; } } @@ -2995,8 +2995,8 @@ void Encoding_InitDefaults() { for (int i = CPI_UTF7 + 1; i < COUNTOF(g_Encodings); ++i) { if (Encoding_IsValid(i) && (g_Encodings[i].uCodePage == g_Encodings[CPI_OEM].uCodePage)) { g_Encodings[i].uFlags |= NCP_OEM; - if (g_Encodings[i].uFlags & NCP_8BIT) - g_Encodings[CPI_OEM].uFlags |= NCP_8BIT; + if (g_Encodings[i].uFlags & NCP_EXTERNAL_8BIT) + g_Encodings[CPI_OEM].uFlags |= NCP_EXTERNAL_8BIT; break; } } @@ -3282,8 +3282,6 @@ void Encoding_AddToComboboxEx(HWND hwnd,int idSel,BOOL bRecodeOnly) { int id = pEE[i].id; if (!bRecodeOnly || (g_Encodings[id].uFlags & NCP_RECODE)) { - CPINFO cpi; 
- cbei.iItem = SendMessage(hwnd,CB_GETCOUNT,0,0); WCHAR *pwsz = StrChr(pEE[i].wch,L';'); @@ -3301,12 +3299,7 @@ void Encoding_AddToComboboxEx(HWND hwnd,int idSel,BOOL bRecodeOnly) { else if (id == CPI_OEM) StringCchCatN(wchBuf,COUNTOF(wchBuf),wchOEM,COUNTOF(wchOEM)); - if ((g_Encodings[id].uFlags & NCP_INTERNAL) || - (IsValidCodePage(g_Encodings[id].uCodePage) && - GetCPInfo(g_Encodings[id].uCodePage,&cpi))) - cbei.iImage = 0; - else - cbei.iImage = 1; + cbei.iImage = (Encoding_IsValid(id) ? 0 : 1); cbei.lParam = (LPARAM)id; SendMessage(hwnd,CBEM_INSERTITEM,0,(LPARAM)&cbei); @@ -3341,6 +3334,8 @@ BOOL Encoding_GetFromComboboxEx(HWND hwnd,int *pidEncoding) { } + + BOOL Encoding_IsDefault(int iEncoding) { return (g_Encodings[iEncoding].uFlags & NCP_DEFAULT); } @@ -3354,6 +3349,8 @@ BOOL Encoding_IsOEM(int iEncoding) { } UINT Encoding_SciGetCodePage(HWND hwnd) { + UNUSED(hwnd); + return CP_UTF8; // remove internal support for Chinese, Japan, Korean DBCS use UTF-8 instead /* int cp = (UINT)SendMessage(hwnd,SCI_GETCODEPAGE,0,0); @@ -3362,12 +3359,12 @@ UINT Encoding_SciGetCodePage(HWND hwnd) { } return (cp == 0) ? 
CP_ACP : CP_UTF8; */ - UNUSED(hwnd); - return CP_UTF8; } int Encoding_SciMappedCodePage(int iEncoding) { + UNUSED(iEncoding); + return SC_CP_UTF8; // remove internal support for Chinese, Japan, Korean DBCS use UTF-8 instead /* if (Encoding_IsValid(iEncoding)) { @@ -3378,8 +3375,6 @@ int Encoding_SciMappedCodePage(int iEncoding) { } } */ - UNUSED(iEncoding); - return SC_CP_UTF8; } @@ -3403,7 +3398,7 @@ void Encoding_SciSetCodePage(HWND hwnd,int iEncoding) { charset = SC_CHARSET_CHINESEBIG5; break; default: - charset = iDefaultCharSet; + charset = g_iDefaultCharSet; break; } SendMessage(hwnd,SCI_STYLESETCHARACTERSET,(WPARAM)STYLE_DEFAULT,(LPARAM)charset); diff --git a/src/Helpers.h b/src/Helpers.h index f63789cb7..9ebcbcf0e 100644 --- a/src/Helpers.h +++ b/src/Helpers.h @@ -357,12 +357,12 @@ extern int g_DOSEncoding; #define NCP_UNICODE 8 #define NCP_UNICODE_REVERSE 16 #define NCP_UNICODE_BOM 32 -#define NCP_8BIT 64 -#define NCP_ANSI 128 -#define NCP_OEM 256 -#define NCP_MBCS 512 -#define NCP_INTERNAL (NCP_DEFAULT|NCP_UTF8|NCP_UTF8_SIGN|NCP_UNICODE|NCP_UNICODE_REVERSE|NCP_UNICODE_BOM|NCP_ANSI|NCP_OEM|NCP_MBCS) -#define NCP_RECODE 1024 +#define NCP_ANSI 64 +#define NCP_OEM 128 +#define NCP_MBCS 256 +#define NCP_INTERNAL (NCP_DEFAULT|NCP_UTF8|NCP_UTF8_SIGN|NCP_UNICODE|NCP_UNICODE_REVERSE|NCP_UNICODE_BOM|NCP_ANSI|NCP_OEM|NCP_MBCS) +#define NCP_EXTERNAL_8BIT 512 +#define NCP_RECODE 1024 #define CPI_GET -2 #define CPI_NONE -1 @@ -389,7 +389,7 @@ typedef struct _np2encoding { int Encoding_CountOf(); int Encoding_Current(int); // getter/setter -int Encoding_Source(int); // getter/setter +int Encoding_SrcCmdLn(int); // getter/setter int Encoding_SrcWeak(int); // getter/setter BOOL Encoding_HasChanged(int); // query/setter diff --git a/src/Notepad3.c b/src/Notepad3.c index 8c286e1d2..5abe671df 100644 --- a/src/Notepad3.c +++ b/src/Notepad3.c @@ -254,11 +254,11 @@ LPMRULIST mruReplace; DWORD dwLastIOError; -int iDefaultEncoding; -int iDefaultCharSet; +int g_iDefaultEncoding; 
+int g_iDefaultCharSet; -int iEOLMode; -int iDefaultEOLMode; +int g_iEOLMode; +int g_iDefaultEOLMode; int iInitialLine; int iInitialColumn; @@ -282,7 +282,8 @@ UINT msgTaskbarCreated = 0; HMODULE hModUxTheme = NULL; HMODULE hRichEdit = NULL; -EDITFINDREPLACE g_efrData = { "", "", "", "", 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL }; + +EDITFINDREPLACE g_efrData = EFR_INIT_DATA; UINT cpLastFind = 0; BOOL bReplaceInitialized = FALSE; @@ -734,7 +735,7 @@ HWND InitInstance(HINSTANCE hInstance,LPSTR pszCmdLine,int nCmdShow) // Source Encoding if (lpEncodingArg) - Encoding_Source(Encoding_MatchW(lpEncodingArg)); + Encoding_SrcCmdLn(Encoding_MatchW(lpEncodingArg)); // Pathname parameter if (flagBufferFile || (lpFileArg /*&& !flagNewFromClipboard*/)) @@ -792,15 +793,15 @@ HWND InitInstance(HINSTANCE hInstance,LPSTR pszCmdLine,int nCmdShow) } } else { - if (Encoding_Source(CPI_GET) != CPI_NONE) { - Encoding_Current(Encoding_Source(CPI_GET)); - Encoding_HasChanged(Encoding_Source(CPI_GET)); + if (Encoding_SrcCmdLn(CPI_GET) != CPI_NONE) { + Encoding_Current(Encoding_SrcCmdLn(CPI_GET)); + Encoding_HasChanged(Encoding_SrcCmdLn(CPI_GET)); Encoding_SciSetCodePage(g_hwndEdit,Encoding_Current(CPI_GET)); } } // reset - Encoding_Source(CPI_NONE); + Encoding_SrcCmdLn(CPI_NONE); flagQuietCreate = 0; fKeepTitleExcerpt = 0; @@ -1840,7 +1841,7 @@ LRESULT MsgCopyData(HWND hwnd, WPARAM wParam, LPARAM lParam) if (params->flagFileSpecified) { BOOL bOpened = FALSE; - Encoding_Source(params->iSrcEncoding); + Encoding_SrcCmdLn(params->iSrcEncoding); if (PathIsDirectory(¶ms->wchData)) { WCHAR tchFile[MAX_PATH] = { L'\0' }; @@ -1876,11 +1877,7 @@ LRESULT MsgCopyData(HWND hwnd, WPARAM wParam, LPARAM lParam) if (0 != params->flagSetEOLMode) { flagSetEOLMode = params->flagSetEOLMode; - SendMessage( - g_hwndMain, - WM_COMMAND, - MAKELONG(IDM_LINEENDINGS_CRLF + flagSetEOLMode - 1, 1), - 0); + SendMessage(g_hwndMain, WM_COMMAND, MAKELONG(IDM_LINEENDINGS_CRLF + flagSetEOLMode - 1, 1), 0); flagSetEOLMode = 0; } @@ 
-1899,7 +1896,7 @@ LRESULT MsgCopyData(HWND hwnd, WPARAM wParam, LPARAM lParam) } } // reset - Encoding_Source(CPI_NONE); + Encoding_SrcCmdLn(CPI_NONE); } if (params->flagJumpTo) { @@ -2127,9 +2124,9 @@ void MsgInitMenu(HWND hwnd,WPARAM wParam,LPARAM lParam) i = -1; CheckMenuRadioItem(hmenu,IDM_ENCODING_ANSI,IDM_ENCODING_UTF8SIGN,i,MF_BYCOMMAND); - if (iEOLMode == SC_EOL_CRLF) + if (g_iEOLMode == SC_EOL_CRLF) i = IDM_LINEENDINGS_CRLF; - else if (iEOLMode == SC_EOL_LF) + else if (g_iEOLMode == SC_EOL_LF) i = IDM_LINEENDINGS_LF; else i = IDM_LINEENDINGS_CR; @@ -2725,7 +2722,7 @@ LRESULT MsgCommand(HWND hwnd, WPARAM wParam, LPARAM lParam) if (RecodeDlg(hwnd,&iNewEncoding)) { StringCchCopy(tchCurFile2,COUNTOF(tchCurFile2),g_wchCurFile); - Encoding_Source(iNewEncoding); + Encoding_SrcCmdLn(iNewEncoding); FileLoad(TRUE,FALSE,TRUE,FALSE,tchCurFile2); } } @@ -2734,7 +2731,7 @@ LRESULT MsgCommand(HWND hwnd, WPARAM wParam, LPARAM lParam) case IDM_ENCODING_SETDEFAULT: - SelectDefEncodingDlg(hwnd,&iDefaultEncoding); + SelectDefEncodingDlg(hwnd,&g_iDefaultEncoding); break; @@ -2744,9 +2741,9 @@ LRESULT MsgCommand(HWND hwnd, WPARAM wParam, LPARAM lParam) { BeginWaitCursor(NULL) int iNewEOLMode = iLineEndings[LOWORD(wParam)-IDM_LINEENDINGS_CRLF]; - iEOLMode = iNewEOLMode; - SendMessage(g_hwndEdit,SCI_SETEOLMODE,iEOLMode,0); - SendMessage(g_hwndEdit,SCI_CONVERTEOLS,iEOLMode,0); + g_iEOLMode = iNewEOLMode; + SendMessage(g_hwndEdit,SCI_SETEOLMODE,g_iEOLMode,0); + SendMessage(g_hwndEdit,SCI_CONVERTEOLS,g_iEOLMode,0); EditFixPositions(g_hwndEdit); EndWaitCursor() UpdateToolbar(); @@ -2756,7 +2753,7 @@ LRESULT MsgCommand(HWND hwnd, WPARAM wParam, LPARAM lParam) case IDM_LINEENDINGS_SETDEFAULT: - SelectDefLineEndingDlg(hwnd,&iDefaultEOLMode); + SelectDefLineEndingDlg(hwnd,&g_iDefaultEOLMode); break; @@ -4469,7 +4466,7 @@ LRESULT MsgCommand(HWND hwnd, WPARAM wParam, LPARAM lParam) { WCHAR tchCurFile2[MAX_PATH] = { L'\0' }; if (StringCchLenW(g_wchCurFile,COUNTOF(g_wchCurFile))) { - 
Encoding_Source(Encoding_MapUnicode(iDefaultEncoding)); + Encoding_SrcCmdLn(Encoding_MapUnicode(g_iDefaultEncoding)); StringCchCopy(tchCurFile2,COUNTOF(tchCurFile2),g_wchCurFile); FileLoad(FALSE,FALSE,TRUE,TRUE,tchCurFile2); } @@ -4481,7 +4478,7 @@ LRESULT MsgCommand(HWND hwnd, WPARAM wParam, LPARAM lParam) { WCHAR tchCurFile2[MAX_PATH] = { L'\0' }; if (StringCchLenW(g_wchCurFile,COUNTOF(g_wchCurFile))) { - Encoding_Source(CPI_ANSI_DEFAULT); + Encoding_SrcCmdLn(CPI_ANSI_DEFAULT); StringCchCopy(tchCurFile2,COUNTOF(tchCurFile2),g_wchCurFile); FileLoad(FALSE,FALSE,TRUE,TRUE,tchCurFile2); } @@ -4493,7 +4490,7 @@ LRESULT MsgCommand(HWND hwnd, WPARAM wParam, LPARAM lParam) { WCHAR tchCurFile2[MAX_PATH] = { L'\0' }; if (StringCchLenW(g_wchCurFile,COUNTOF(g_wchCurFile))) { - Encoding_Source(CPI_OEM); + Encoding_SrcCmdLn(CPI_OEM); StringCchCopy(tchCurFile2,COUNTOF(tchCurFile2),g_wchCurFile); FileLoad(FALSE,FALSE,TRUE,TRUE,tchCurFile2); } @@ -4566,8 +4563,9 @@ LRESULT MsgCommand(HWND hwnd, WPARAM wParam, LPARAM lParam) struct tm sst; UINT cp; - EDITFINDREPLACE efrTS = { "", "", "", "", SCFIND_REGEXP, 0, 0, 0, 0, 0, 0, 0, 0, NULL }; + EDITFINDREPLACE efrTS = EFR_INIT_DATA; efrTS.hwnd = g_hwndEdit; + efrTS.fuFlags = SCFIND_REGEXP; IniGetString(L"Settings2",L"TimeStamp",L"\\$Date:[^\\$]+\\$ | $Date: %Y/%m/%d %H:%M:%S $",wchFind,COUNTOF(wchFind)); @@ -5372,7 +5370,7 @@ LRESULT MsgNotify(HWND hwnd,WPARAM wParam,LPARAM lParam) if (bAutoIndent && (scn->ch == '\x0D' || scn->ch == '\x0A')) { // in CRLF mode handle LF only... 
- if ((SC_EOL_CRLF == iEOLMode && scn->ch != '\x0A') || SC_EOL_CRLF != iEOLMode) + if ((SC_EOL_CRLF == g_iEOLMode && scn->ch != '\x0A') || SC_EOL_CRLF != g_iEOLMode) { DocPos iCurPos = SciCall_GetCurrentPos(); DocLn iCurLine = SciCall_LineFromPosition(iCurPos); @@ -5608,9 +5606,9 @@ LRESULT MsgNotify(HWND hwnd,WPARAM wParam,LPARAM lParam) return TRUE; case STATUS_EOLMODE: - if (iEOLMode == SC_EOL_CRLF) + if (g_iEOLMode == SC_EOL_CRLF) i = IDM_LINEENDINGS_CRLF; - else if (iEOLMode == SC_EOL_LF) + else if (g_iEOLMode == SC_EOL_LF) i = IDM_LINEENDINGS_LF; else i = IDM_LINEENDINGS_CR; @@ -5798,10 +5796,10 @@ void LoadSettings() bViewEOLs = IniSectionGetBool(pIniSection,L"ViewEOLs", FALSE); - iDefaultEncoding = IniSectionGetInt(pIniSection,L"DefaultEncoding", CPI_NONE); + g_iDefaultEncoding = IniSectionGetInt(pIniSection,L"DefaultEncoding", CPI_NONE); // if DefaultEncoding is not defined set to system's current code-page - iDefaultEncoding = (iDefaultEncoding == CPI_NONE) ? - Encoding_MapIniSetting(TRUE,(int)GetACP()) : Encoding_MapIniSetting(TRUE,iDefaultEncoding); + g_iDefaultEncoding = (g_iDefaultEncoding == CPI_NONE) ? 
+ Encoding_MapIniSetting(TRUE,(int)GetACP()) : Encoding_MapIniSetting(TRUE,g_iDefaultEncoding); bSkipUnicodeDetection = IniSectionGetBool(pIniSection, L"SkipUnicodeDetection", FALSE); @@ -5811,8 +5809,8 @@ void LoadSettings() bNoEncodingTags = IniSectionGetBool(pIniSection,L"NoEncodingTags", FALSE); - iDefaultEOLMode = IniSectionGetInt(pIniSection,L"DefaultEOLMode",0); - iDefaultEOLMode = max(min(iDefaultEOLMode,2),0); + g_iDefaultEOLMode = IniSectionGetInt(pIniSection,L"DefaultEOLMode",0); + g_iDefaultEOLMode = max(min(g_iDefaultEOLMode,2),0); bFixLineEndings = IniSectionGetBool(pIniSection,L"FixLineEndings",TRUE); @@ -6005,22 +6003,22 @@ void LoadSettings() // remove internal support for Chinese, Japan, Korean DBCS use UTF-8 instead /* - if (iDefaultEncoding == CPI_ANSI_DEFAULT) + if (g_iDefaultEncoding == CPI_ANSI_DEFAULT) { // check for Chinese, Japan, Korean DBCS code pages and switch accordingly int acp = (int)GetACP(); if (acp == 932 || acp == 936 || acp == 949 || acp == 950) { iSciDefaultCodePage = acp; } - iDefaultEncoding = Encoding_GetByCodePage(iSciDefaultCodePage); + g_iDefaultEncoding = Encoding_GetByCodePage(iSciDefaultCodePage); } */ // set flag for encoding default - g_Encodings[iDefaultEncoding].uFlags |= NCP_DEFAULT; + g_Encodings[g_iDefaultEncoding].uFlags |= NCP_DEFAULT; // define default charset - iDefaultCharSet = (int)CharSetFromCodePage((UINT)iSciDefaultCodePage); + g_iDefaultCharSet = (int)CharSetFromCodePage((UINT)iSciDefaultCodePage); // Scintilla Styles Style_Load(); @@ -6103,12 +6101,12 @@ void SaveSettings(BOOL bSaveSettingsNow) { IniSectionSetBool(pIniSection, L"MarkOccurrencesCurrentWord", bMarkOccurrencesCurrentWord); IniSectionSetBool(pIniSection, L"ViewWhiteSpace", bViewWhiteSpace); IniSectionSetBool(pIniSection, L"ViewEOLs", bViewEOLs); - IniSectionSetInt(pIniSection, L"DefaultEncoding", Encoding_MapIniSetting(FALSE, iDefaultEncoding)); + IniSectionSetInt(pIniSection, L"DefaultEncoding", Encoding_MapIniSetting(FALSE, 
g_iDefaultEncoding)); IniSectionSetBool(pIniSection, L"SkipUnicodeDetection", bSkipUnicodeDetection); IniSectionSetInt(pIniSection, L"LoadASCIIasUTF8", bLoadASCIIasUTF8); IniSectionSetBool(pIniSection, L"LoadNFOasOEM", bLoadNFOasOEM); IniSectionSetBool(pIniSection, L"NoEncodingTags", bNoEncodingTags); - IniSectionSetInt(pIniSection, L"DefaultEOLMode", iDefaultEOLMode); + IniSectionSetInt(pIniSection, L"DefaultEOLMode", g_iDefaultEOLMode); IniSectionSetBool(pIniSection, L"FixLineEndings", bFixLineEndings); IniSectionSetBool(pIniSection, L"FixTrailingBlanks", bAutoStripBlanks); IniSectionSetInt(pIniSection, L"PrintHeader", iPrintHeader); @@ -7057,11 +7055,11 @@ void UpdateStatusbar() Encoding_SetLabel(iEncoding); StringCchPrintf(tchEncoding, COUNTOF(tchEncoding), L" %s ", g_Encodings[iEncoding].wchLabel); - if (iEOLMode == SC_EOL_CR) + if (g_iEOLMode == SC_EOL_CR) { StringCchCopy(tchEOLMode, COUNTOF(tchEOLMode), L" CR "); } - else if (iEOLMode == SC_EOL_LF) + else if (g_iEOLMode == SC_EOL_LF) { StringCchCopy(tchEOLMode, COUNTOF(tchEOLMode), L" LF "); } @@ -7435,11 +7433,11 @@ BOOL FileLoad(BOOL bDontSave,BOOL bNew,BOOL bReload,BOOL bNoEncDetect,LPCWSTR lp EditSetNewText(g_hwndEdit,"",0); Style_SetLexer(g_hwndEdit,NULL); - iEOLMode = iLineEndings[iDefaultEOLMode]; - SendMessage(g_hwndEdit,SCI_SETEOLMODE,iLineEndings[iDefaultEOLMode],0); - Encoding_Current(iDefaultEncoding); - Encoding_HasChanged(iDefaultEncoding); - Encoding_SciSetCodePage(g_hwndEdit,iDefaultEncoding); + g_iEOLMode = iLineEndings[g_iDefaultEOLMode]; + SendMessage(g_hwndEdit,SCI_SETEOLMODE,iLineEndings[g_iDefaultEOLMode],0); + Encoding_Current(g_iDefaultEncoding); + Encoding_HasChanged(g_iDefaultEncoding); + Encoding_SciSetCodePage(g_hwndEdit,g_iDefaultEncoding); EditSetNewText(g_hwndEdit,"",0); bReadOnly = FALSE; @@ -7505,16 +7503,16 @@ BOOL FileLoad(BOOL bDontSave,BOOL bNew,BOOL bReload,BOOL bNoEncDetect,LPCWSTR lp FileVars_Init(NULL,0,&fvCurFile); EditSetNewText(g_hwndEdit,"",0); 
Style_SetLexer(g_hwndEdit,NULL); - iEOLMode = iLineEndings[iDefaultEOLMode]; - SendMessage(g_hwndEdit,SCI_SETEOLMODE,iLineEndings[iDefaultEOLMode],0); - if (Encoding_Source(CPI_GET) != CPI_NONE) { - fileEncoding = Encoding_Source(CPI_GET); + g_iEOLMode = iLineEndings[g_iDefaultEOLMode]; + SendMessage(g_hwndEdit,SCI_SETEOLMODE,iLineEndings[g_iDefaultEOLMode],0); + if (Encoding_SrcCmdLn(CPI_GET) != CPI_NONE) { + fileEncoding = Encoding_SrcCmdLn(CPI_GET); Encoding_Current(fileEncoding); Encoding_HasChanged(fileEncoding); } else { - Encoding_Current(iDefaultEncoding); - Encoding_HasChanged(iDefaultEncoding); + Encoding_Current(g_iDefaultEncoding); + Encoding_HasChanged(g_iDefaultEncoding); } Encoding_SciSetCodePage(g_hwndEdit,Encoding_Current(CPI_GET)); bReadOnly = FALSE; @@ -7532,12 +7530,12 @@ BOOL FileLoad(BOOL bDontSave,BOOL bNew,BOOL bReload,BOOL bNoEncDetect,LPCWSTR lp if (!bReload && MRU_FindFile(pFileMRU,szFileName,&idx)) { fileEncoding = pFileMRU->iEncoding[idx]; if (fileEncoding > 0) - Encoding_Source(Encoding_MapUnicode(fileEncoding)); + Encoding_SrcCmdLn(Encoding_MapUnicode(fileEncoding)); } else fileEncoding = Encoding_Current(CPI_GET); - fSuccess = FileIO(TRUE,szFileName,bNoEncDetect,&fileEncoding,&iEOLMode,&bUnicodeErr,&bFileTooBig,&bUnknownExt,NULL,FALSE); + fSuccess = FileIO(TRUE,szFileName,bNoEncDetect,&fileEncoding,&g_iEOLMode,&bUnicodeErr,&bFileTooBig,&bUnknownExt,NULL,FALSE); if (fSuccess) Encoding_Current(fileEncoding); // load may change encoding } @@ -7552,7 +7550,7 @@ BOOL FileLoad(BOOL bDontSave,BOOL bNew,BOOL bReload,BOOL bNoEncDetect,LPCWSTR lp if (!flagLexerSpecified) // flag will be cleared Style_SetLexerFromFile(g_hwndEdit,g_wchCurFile); - SendMessage(g_hwndEdit,SCI_SETEOLMODE,iEOLMode,0); + SendMessage(g_hwndEdit,SCI_SETEOLMODE,g_iEOLMode,0); fileEncoding = Encoding_Current(CPI_GET); Encoding_HasChanged(fileEncoding); int idx, iCaretPos = 0; @@ -7642,6 +7640,7 @@ BOOL FileRevert(LPCWSTR szFileName) //BOOL bIsTail = (iCurPos == 
iAnchorPos) && (iCurPos == SendMessage(g_hwndEdit, SCI_GETLENGTH, 0, 0)); BOOL bIsTail = (iCurPos == iAnchorPos) && (iCurrLine >= (SciCall_GetLineCount() - 1)); + Encoding_SrcWeak(Encoding_Current(CPI_GET)); WCHAR tchFileName2[MAX_PATH] = { L'\0' }; @@ -7759,7 +7758,7 @@ BOOL FileSave(BOOL bSaveAlways,BOOL bAsk,BOOL bSaveAs,BOOL bSaveCopy) if (SaveFileDlg(g_hwndMain,tchFile,COUNTOF(tchFile),tchInitialDir)) { int fileEncoding = Encoding_Current(CPI_GET); - fSuccess = FileIO(FALSE, tchFile, FALSE, &fileEncoding, &iEOLMode, NULL, NULL, NULL, &bCancelDataLoss, bSaveCopy); + fSuccess = FileIO(FALSE, tchFile, FALSE, &fileEncoding, &g_iEOLMode, NULL, NULL, NULL, &bCancelDataLoss, bSaveCopy); //~if (fSuccess) Encoding_Current(fileEncoding); // save should not change encoding if (fSuccess) { @@ -7786,7 +7785,7 @@ BOOL FileSave(BOOL bSaveAlways,BOOL bAsk,BOOL bSaveAs,BOOL bSaveCopy) } else { int fileEncoding = Encoding_Current(CPI_GET); - fSuccess = FileIO(FALSE,g_wchCurFile,FALSE,&fileEncoding,&iEOLMode,NULL,NULL,NULL,&bCancelDataLoss,FALSE); + fSuccess = FileIO(FALSE,g_wchCurFile,FALSE,&fileEncoding,&g_iEOLMode,NULL,NULL,NULL,&bCancelDataLoss,FALSE); //~if (fSuccess) Encoding_Current(fileEncoding); // save should not change encoding } @@ -7825,7 +7824,7 @@ BOOL FileSave(BOOL bSaveAlways,BOOL bAsk,BOOL bSaveAs,BOOL bSaveCopy) if (GetTempPath(MAX_PATH,lpTempPathBuffer) && GetTempFileName(lpTempPathBuffer,TEXT("NP3"),0,szTempFileName)) { int fileEncoding = Encoding_Current(CPI_GET); - if (FileIO(FALSE,szTempFileName,FALSE,&fileEncoding,&iEOLMode,NULL,NULL,NULL,&bCancelDataLoss,TRUE)) { + if (FileIO(FALSE,szTempFileName,FALSE,&fileEncoding,&g_iEOLMode,NULL,NULL,NULL,&bCancelDataLoss,TRUE)) { //~Encoding_Current(fileEncoding); // save should not change encoding WCHAR szArguments[2048] = { L'\0' }; LPWSTR lpCmdLine = GetCommandLine(); diff --git a/src/Styles.c b/src/Styles.c index a680d5e49..facf6f978 100644 --- a/src/Styles.c +++ b/src/Styles.c @@ -2987,7 +2987,7 @@ static int 
g_cxStyleSelectDlg; static int g_cyStyleSelectDlg; -extern int iDefaultCharSet; +extern int g_iDefaultCharSet; extern BOOL bHiliteCurrentLine; extern BOOL bHyperlinkHotspot; @@ -5112,9 +5112,9 @@ BOOL Style_SelectFont(HWND hwnd,LPWSTR lpszStyle,int cchStyle, LPCWSTR sLexerNam } } - int iCharSet = iDefaultCharSet; + int iCharSet = g_iDefaultCharSet; if (!Style_StrGetCharSet(lpszStyle, &iCharSet)) { - iCharSet = iDefaultCharSet; + iCharSet = g_iDefaultCharSet; } // is "size:" definition relative ? @@ -5261,7 +5261,7 @@ BOOL Style_SelectFont(HWND hwnd,LPWSTR lpszStyle,int cchStyle, LPCWSTR sLexerNam if (bGlobalDefaultStyle && (lf.lfCharSet != DEFAULT_CHARSET) && (lf.lfCharSet != ANSI_CHARSET) && - (lf.lfCharSet != iDefaultCharSet)) { + (lf.lfCharSet != g_iDefaultCharSet)) { if (lf.lfCharSet == iCharSet) { if (StrStrI(lpszStyle, L"charset:")) { From 336d0ab2f15af8f6e652c29dc835923df4aa6cbc Mon Sep 17 00:00:00 2001 From: Rainer Kottenhoff Date: Thu, 1 Mar 2018 12:11:43 +0100 Subject: [PATCH 2/6] + fix: use Windows regional ANSI Code Page settings for fallback in case of file encoding detection failure (instead of default encoding for created files) --- src/Dialogs.c | 5 +++++ src/Edit.c | 27 ++++++++++----------------- src/Helpers.c | 5 +++-- src/Notepad3.c | 34 +++++++++++++++++++--------------- src/Notepad3.rc | 30 ++++++++++++++++-------------- src/resource.h | 1 + 6 files changed, 54 insertions(+), 48 deletions(-) diff --git a/src/Dialogs.c b/src/Dialogs.c index 778c7740d..8b7104114 100644 --- a/src/Dialogs.c +++ b/src/Dialogs.c @@ -55,6 +55,7 @@ extern WCHAR g_wchCurFile[]; extern WCHAR g_wchAppUserModelID[]; extern DWORD dwLastIOError; +extern BOOL bUseDefaultForFileEncoding; extern BOOL bSkipUnicodeDetection; extern BOOL bLoadASCIIasUTF8; extern BOOL bLoadNFOasOEM; @@ -2094,6 +2095,9 @@ INT_PTR CALLBACK SelectDefEncodingDlgProc(HWND hwnd,UINT umsg,WPARAM wParam,LPAR Encoding_AddToComboboxEx(GetDlgItem(hwnd,IDC_ENCODINGLIST),pdd->idEncoding,0); + if 
(bUseDefaultForFileEncoding) + CheckDlgButton(hwnd, IDC_USEASREADINGFALLBACK, BST_CHECKED); + if (bSkipUnicodeDetection) CheckDlgButton(hwnd,IDC_NOUNICODEDETECTION,BST_CHECKED); @@ -2121,6 +2125,7 @@ INT_PTR CALLBACK SelectDefEncodingDlgProc(HWND hwnd,UINT umsg,WPARAM wParam,LPAR EndDialog(hwnd,IDCANCEL); } else { + bUseDefaultForFileEncoding = (IsDlgButtonChecked(hwnd, IDC_USEASREADINGFALLBACK) == BST_CHECKED) ? 1 : 0; bSkipUnicodeDetection = (IsDlgButtonChecked(hwnd,IDC_NOUNICODEDETECTION) == BST_CHECKED) ? 1 : 0; bLoadASCIIasUTF8 = (IsDlgButtonChecked(hwnd,IDC_ASCIIASUTF8) == BST_CHECKED) ? 1 : 0; bLoadNFOasOEM = (IsDlgButtonChecked(hwnd,IDC_NFOASOEM) == BST_CHECKED) ? 1 : 0; diff --git a/src/Edit.c b/src/Edit.c index 21799bc8c..2f8bb26ae 100644 --- a/src/Edit.c +++ b/src/Edit.c @@ -64,6 +64,7 @@ extern DWORD dwLastIOError; extern UINT cpLastFind; extern BOOL bReplaceInitialized; extern BOOL bUseOldStyleBraceMatching; +extern BOOL bUseDefaultForFileEncoding; extern BOOL bSkipUnicodeDetection; extern BOOL bFindReplCopySelOrClip; @@ -81,7 +82,7 @@ extern BOOL bFixLineEndings; extern BOOL bAutoStripBlanks; // Default Codepage and Character Set -extern int g_iDefaultEncoding; +extern int g_iDefaultNewFileEncoding; extern int g_iDefaultCharSet; extern BOOL bLoadASCIIasUTF8; extern BOOL bLoadNFOasOEM; @@ -193,8 +194,8 @@ HWND EditCreate(HWND hwndParent) g_hInstance, NULL); - Encoding_Current(g_iDefaultEncoding); - Encoding_SciSetCodePage(hwnd,g_iDefaultEncoding); + Encoding_Current(g_iDefaultNewFileEncoding); + Encoding_SciSetCodePage(hwnd,g_iDefaultNewFileEncoding); SendMessage(hwnd,SCI_SETEOLMODE,SC_EOL_CRLF,0); SendMessage(hwnd,SCI_SETPASTECONVERTENDINGS,TRUE,0); SendMessage(hwnd,SCI_SETMODEVENTMASK,/*SC_MODEVENTMASKALL*/SC_MOD_INSERTTEXT|SC_MOD_DELETETEXT|SC_MOD_CONTAINER,0); @@ -1061,9 +1062,9 @@ BOOL EditLoadFile( } const int iFileEncoding = Encoding_SrcCmdLn(CPI_GET); - const int iPreferedEncoding = (bPreferOEM) ? 
g_DOSEncoding : - (Encoding_IsValid(Encoding_SrcWeak(CPI_GET)) ? Encoding_SrcWeak(CPI_GET) : g_iDefaultEncoding); - + const int iFileEncWeak = (Encoding_SrcWeak(CPI_GET) != CPI_NONE) ? Encoding_SrcWeak(CPI_GET) : CPI_ANSI_DEFAULT; + const int iPreferedEncoding = (bPreferOEM) ? g_DOSEncoding : (bUseDefaultForFileEncoding ? g_iDefaultNewFileEncoding : iFileEncWeak); + //@@@(g_Encodings[iFileEncWeak].uFlags & NCP_INTERNAL) ? g_iDefaultNewFileEncoding : iFileEncWeak; BOOL bBOM = FALSE; BOOL bReverse = FALSE; @@ -1176,6 +1177,7 @@ BOOL EditLoadFile( } else { + if (iFileEncoding != CPI_NONE) *iEncoding = iFileEncoding; else { @@ -1184,16 +1186,7 @@ BOOL EditLoadFile( if (fvCurFile.mask & FV_ENCODING) *iEncoding = CPI_ANSI_DEFAULT; else { - int iEncWeak = Encoding_SrcWeak(CPI_GET); - switch (iEncWeak) { - case CPI_NONE: - *iEncoding = iPreferedEncoding; - break; - default: - *iEncoding = (g_Encodings[iEncWeak].uFlags & NCP_INTERNAL) ? g_iDefaultEncoding : - (Encoding_IsValid(iEncWeak) ? iEncWeak : iPreferedEncoding); - break; - } + *iEncoding = iPreferedEncoding; } } } @@ -1236,7 +1229,7 @@ BOOL EditLoadFile( } } else { - *iEncoding = Encoding_IsValid(iFileEncoding) ? iFileEncoding : g_iDefaultEncoding; + *iEncoding = Encoding_IsValid(iFileEncoding) ? 
iFileEncoding : iPreferedEncoding; Encoding_SciSetCodePage(hwnd,*iEncoding); EditSetNewText(hwnd,"",0); EditSetNewText(hwnd,lpData,cbData); diff --git a/src/Helpers.c b/src/Helpers.c index e8d4271e9..a78275a89 100644 --- a/src/Helpers.c +++ b/src/Helpers.c @@ -42,8 +42,6 @@ extern HINSTANCE g_hInstance; -extern BOOL bSkipUnicodeDetection; -extern BOOL bPreserveCaretPos; //============================================================================= @@ -1824,6 +1822,8 @@ UINT CharSetFromCodePage(UINT uCodePage) { } +extern BOOL bPreserveCaretPos; + //============================================================================= // // MRU functions @@ -3405,6 +3405,7 @@ void Encoding_SciSetCodePage(HWND hwnd,int iEncoding) { */ } +extern BOOL bSkipUnicodeDetection; BOOL IsUnicode(const char* pBuffer,int cb,LPBOOL lpbBOM,LPBOOL lpbReverse) { int i = 0xFFFF; diff --git a/src/Notepad3.c b/src/Notepad3.c index 5abe671df..b77badf03 100644 --- a/src/Notepad3.c +++ b/src/Notepad3.c @@ -174,6 +174,7 @@ BOOL g_bCodeFoldingAvailable; BOOL g_bShowCodeFolding; BOOL bViewWhiteSpace; BOOL bViewEOLs; +BOOL bUseDefaultForFileEncoding; BOOL bSkipUnicodeDetection; BOOL bLoadASCIIasUTF8; BOOL bLoadNFOasOEM; @@ -254,7 +255,7 @@ LPMRULIST mruReplace; DWORD dwLastIOError; -int g_iDefaultEncoding; +int g_iDefaultNewFileEncoding; int g_iDefaultCharSet; int g_iEOLMode; @@ -2731,7 +2732,7 @@ LRESULT MsgCommand(HWND hwnd, WPARAM wParam, LPARAM lParam) case IDM_ENCODING_SETDEFAULT: - SelectDefEncodingDlg(hwnd,&g_iDefaultEncoding); + SelectDefEncodingDlg(hwnd,&g_iDefaultNewFileEncoding); break; @@ -4466,7 +4467,7 @@ LRESULT MsgCommand(HWND hwnd, WPARAM wParam, LPARAM lParam) { WCHAR tchCurFile2[MAX_PATH] = { L'\0' }; if (StringCchLenW(g_wchCurFile,COUNTOF(g_wchCurFile))) { - Encoding_SrcCmdLn(Encoding_MapUnicode(g_iDefaultEncoding)); + Encoding_SrcCmdLn(Encoding_MapUnicode(g_iDefaultNewFileEncoding)); StringCchCopy(tchCurFile2,COUNTOF(tchCurFile2),g_wchCurFile); 
FileLoad(FALSE,FALSE,TRUE,TRUE,tchCurFile2); } @@ -5796,10 +5797,12 @@ void LoadSettings() bViewEOLs = IniSectionGetBool(pIniSection,L"ViewEOLs", FALSE); - g_iDefaultEncoding = IniSectionGetInt(pIniSection,L"DefaultEncoding", CPI_NONE); + g_iDefaultNewFileEncoding = IniSectionGetInt(pIniSection,L"DefaultEncoding", CPI_NONE); // if DefaultEncoding is not defined set to system's current code-page - g_iDefaultEncoding = (g_iDefaultEncoding == CPI_NONE) ? - Encoding_MapIniSetting(TRUE,(int)GetACP()) : Encoding_MapIniSetting(TRUE,g_iDefaultEncoding); + g_iDefaultNewFileEncoding = (g_iDefaultNewFileEncoding == CPI_NONE) ? + Encoding_MapIniSetting(TRUE,(int)GetACP()) : Encoding_MapIniSetting(TRUE,g_iDefaultNewFileEncoding); + + bUseDefaultForFileEncoding = IniSectionGetBool(pIniSection, L"UseDefaultForFileEncoding", FALSE); bSkipUnicodeDetection = IniSectionGetBool(pIniSection, L"SkipUnicodeDetection", FALSE); @@ -6003,19 +6006,19 @@ void LoadSettings() // remove internal support for Chinese, Japan, Korean DBCS use UTF-8 instead /* - if (g_iDefaultEncoding == CPI_ANSI_DEFAULT) + if (g_iDefaultNewFileEncoding == CPI_ANSI_DEFAULT) { // check for Chinese, Japan, Korean DBCS code pages and switch accordingly int acp = (int)GetACP(); if (acp == 932 || acp == 936 || acp == 949 || acp == 950) { iSciDefaultCodePage = acp; } - g_iDefaultEncoding = Encoding_GetByCodePage(iSciDefaultCodePage); + g_iDefaultNewFileEncoding = Encoding_GetByCodePage(iSciDefaultCodePage); } */ // set flag for encoding default - g_Encodings[g_iDefaultEncoding].uFlags |= NCP_DEFAULT; + g_Encodings[g_iDefaultNewFileEncoding].uFlags |= NCP_DEFAULT; // define default charset g_iDefaultCharSet = (int)CharSetFromCodePage((UINT)iSciDefaultCodePage); @@ -6101,7 +6104,8 @@ void SaveSettings(BOOL bSaveSettingsNow) { IniSectionSetBool(pIniSection, L"MarkOccurrencesCurrentWord", bMarkOccurrencesCurrentWord); IniSectionSetBool(pIniSection, L"ViewWhiteSpace", bViewWhiteSpace); IniSectionSetBool(pIniSection, 
L"ViewEOLs", bViewEOLs); - IniSectionSetInt(pIniSection, L"DefaultEncoding", Encoding_MapIniSetting(FALSE, g_iDefaultEncoding)); + IniSectionSetInt(pIniSection, L"DefaultEncoding", Encoding_MapIniSetting(FALSE, g_iDefaultNewFileEncoding)); + IniSectionSetBool(pIniSection, L"UseDefaultForFileEncoding", bUseDefaultForFileEncoding); IniSectionSetBool(pIniSection, L"SkipUnicodeDetection", bSkipUnicodeDetection); IniSectionSetInt(pIniSection, L"LoadASCIIasUTF8", bLoadASCIIasUTF8); IniSectionSetBool(pIniSection, L"LoadNFOasOEM", bLoadNFOasOEM); @@ -7435,9 +7439,9 @@ BOOL FileLoad(BOOL bDontSave,BOOL bNew,BOOL bReload,BOOL bNoEncDetect,LPCWSTR lp g_iEOLMode = iLineEndings[g_iDefaultEOLMode]; SendMessage(g_hwndEdit,SCI_SETEOLMODE,iLineEndings[g_iDefaultEOLMode],0); - Encoding_Current(g_iDefaultEncoding); - Encoding_HasChanged(g_iDefaultEncoding); - Encoding_SciSetCodePage(g_hwndEdit,g_iDefaultEncoding); + Encoding_Current(g_iDefaultNewFileEncoding); + Encoding_HasChanged(g_iDefaultNewFileEncoding); + Encoding_SciSetCodePage(g_hwndEdit,g_iDefaultNewFileEncoding); EditSetNewText(g_hwndEdit,"",0); bReadOnly = FALSE; @@ -7511,8 +7515,8 @@ BOOL FileLoad(BOOL bDontSave,BOOL bNew,BOOL bReload,BOOL bNoEncDetect,LPCWSTR lp Encoding_HasChanged(fileEncoding); } else { - Encoding_Current(g_iDefaultEncoding); - Encoding_HasChanged(g_iDefaultEncoding); + Encoding_Current(g_iDefaultNewFileEncoding); + Encoding_HasChanged(g_iDefaultNewFileEncoding); } Encoding_SciSetCodePage(g_hwndEdit,Encoding_Current(CPI_GET)); bReadOnly = FALSE; diff --git a/src/Notepad3.rc b/src/Notepad3.rc index 73590ecdd..3a796c651 100644 --- a/src/Notepad3.rc +++ b/src/Notepad3.rc @@ -25,10 +25,10 @@ LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US // Icon with lowest ID value placed first to ensure application icon // remains consistent on all systems. 
-IDI_RUN ICON "..\\res\\Run.ico" - IDR_MAINWND ICON "..\\res\\Notepad3.ico" +IDI_RUN ICON "..\\res\\Run.ico" + IDR_MAINWND128 ICON "..\\res\\Notepad3_128.ico" IDI_STYLES ICON "..\\res\\Styles.ico" @@ -486,7 +486,7 @@ BEGIN "M", IDM_VIEW_MARGIN, VIRTKEY, SHIFT, CONTROL, NOINVERT "N", IDM_FILE_NEW, VIRTKEY, CONTROL, NOINVERT "N", IDM_FILE_NEWWINDOW, VIRTKEY, ALT, NOINVERT - "N", IDM_FILE_NEWWINDOW2, VIRTKEY, ALT, SHIFT, NOINVERT + "N", IDM_FILE_NEWWINDOW2, VIRTKEY, SHIFT, ALT, NOINVERT "N", IDM_VIEW_LINENUMBERS, VIRTKEY, SHIFT, CONTROL, NOINVERT "O", IDM_FILE_OPEN, VIRTKEY, CONTROL, NOINVERT "O", IDM_EDIT_SORTLINES, VIRTKEY, ALT, NOINVERT @@ -744,23 +744,25 @@ BEGIN SCROLLBAR IDC_RESIZEGRIP3,7,112,10,10 END -IDD_DEFENCODING DIALOGEX 0, 0, 180, 118 +IDD_DEFENCODING DIALOGEX 0, 0, 181, 122 STYLE DS_SETFONT | DS_MODALFRAME | DS_FIXEDSYS | WS_POPUP | WS_CAPTION | WS_SYSMENU CAPTION "Encoding" FONT 8, "MS Shell Dlg", 400, 0, 0x1 BEGIN - LTEXT "&Default encoding:",IDC_STATIC,7,7,58,8 - CONTROL "",IDC_ENCODINGLIST,"ComboBoxEx32",CBS_DROPDOWNLIST | WS_CLIPSIBLINGS | WS_VSCROLL | WS_TABSTOP,7,20,166,128 + LTEXT "&Default encoding (new file):",IDC_STATIC,7,7,90,8 + CONTROL "",IDC_ENCODINGLIST,"ComboBoxEx32",CBS_DROPDOWNLIST | WS_CLIPSIBLINGS | WS_VSCROLL | WS_TABSTOP,7,20,167,128 + CONTROL "Fallback on detection failure.", IDC_USEASREADINGFALLBACK, + "Button", BS_AUTOCHECKBOX | WS_TABSTOP, 7, 40, 108, 10 CONTROL "Skip automatic &Unicode detection.",IDC_NOUNICODEDETECTION, - "Button",BS_AUTOCHECKBOX | WS_TABSTOP,7,43,124,10 + "Button",BS_AUTOCHECKBOX | WS_TABSTOP,7,51,124,10 CONTROL "Open 7-bit &ASCII files in UTF-8 mode.",IDC_ASCIIASUTF8, - "Button",BS_AUTOCHECKBOX | WS_TABSTOP,7,53,136,10 + "Button",BS_AUTOCHECKBOX | WS_TABSTOP,7,62,136,10 CONTROL "Open 8-bit *.&nfo/diz files in DOS-437 mode.",IDC_NFOASOEM, - "Button",BS_AUTOCHECKBOX | WS_TABSTOP,7,63,166,10 + "Button",BS_AUTOCHECKBOX | WS_TABSTOP,7,73,167,10 CONTROL "Don't parse encoding &tags.",IDC_ENCODINGFROMFILEVARS, - 
"Button",BS_AUTOCHECKBOX | WS_TABSTOP,7,73,102,10 - DEFPUSHBUTTON "OK",IDOK,68,97,50,14 - PUSHBUTTON "Cancel",IDCANCEL,123,97,50,14 + "Button",BS_AUTOCHECKBOX | WS_TABSTOP,7,84,102,10 + DEFPUSHBUTTON "OK",IDOK,68,101,50,14 + PUSHBUTTON "Cancel",IDCANCEL,124,101,50,14 END IDD_DEFEOLMODE DIALOGEX 0, 0, 180, 78 @@ -1209,9 +1211,9 @@ BEGIN IDD_DEFENCODING, DIALOG BEGIN LEFTMARGIN, 7 - RIGHTMARGIN, 173 + RIGHTMARGIN, 174 TOPMARGIN, 7 - BOTTOMMARGIN, 101 + BOTTOMMARGIN, 115 END IDD_DEFEOLMODE, DIALOG diff --git a/src/resource.h b/src/resource.h index 60ad0f250..5e611296e 100644 --- a/src/resource.h +++ b/src/resource.h @@ -134,6 +134,7 @@ #define IDC_SWAPSTRG 230 #define IDC_CHECK_OCC 231 #define IDC_PRINTER 232 +#define IDC_USEASREADINGFALLBACK 233 #define IDACC_FIND 302 #define IDACC_REPLACE 303 #define IDACC_SAVEPOS 304 From 454c5b108a0623ebc71d4c94a9422a14c236ac66 Mon Sep 17 00:00:00 2001 From: Rainer Kottenhoff Date: Thu, 1 Mar 2018 16:17:11 +0100 Subject: [PATCH 3/6] + fix: Onigmo backward search + fix: use all ASCII line breaks for line termination ($) meta char --- onigmo/enc/utf_8.c | 32 ++++++++++++--------- onigmo/regenc.h | 9 +++--- scionigmo/OnigmoRegExEngine.cxx | 51 ++++++++++++--------------------- 3 files changed, 41 insertions(+), 51 deletions(-) diff --git a/onigmo/enc/utf_8.c b/onigmo/enc/utf_8.c index fae48adca..f5ec6b9ea 100644 --- a/onigmo/enc/utf_8.c +++ b/onigmo/enc/utf_8.c @@ -252,17 +252,21 @@ static int is_mbc_newline(const UChar* p, const UChar* end, OnigEncoding enc) { if (p < end) { - if ((*p == 0x0a) || (*p == 0x0d)) return 1; // LF or CR -#ifdef USE_UNICODE_ALL_LINE_TERMINATORS - if (*p == 0x0b || *p == 0x0c || *p == 0x0d) return 1; + if (*p == 0x0a) return 1; // LF + +#if defined(USE_ASCII_ALL_LINE_BREAKS) || defined(USE_UNICODE_ALL_LINE_TERMINATORS) + if (*p == 0x0b || *p == 0x0c || *p == 0x0d) return 1; // VT FF CR +#endif + +#ifdef USE_UNICODE_ALL_LINE_TERMINATORS if (p + 1 < end) { - if (*(p+1) == 0x85 && *p == 0xc2) /* U+0085 */ - 
return 1; + if (*(p + 1) == 0x85 && *p == 0xc2) /* U+0085 */ + return 1; if (p + 2 < end) { - if ((*(p+2) == 0xa8 || *(p+2) == 0xa9) - && *(p+1) == 0x80 && *p == 0xe2) /* U+2028, U+2029 */ - return 1; + if ((*(p + 2) == 0xa8 || *(p + 2) == 0xa9) + && *(p + 1) == 0x80 && *p == 0xe2) /* U+2028, U+2029 */ + return 1; } } #endif @@ -359,7 +363,7 @@ code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc ARG_UNUSED) static int mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, - const UChar* end, UChar* fold, OnigEncoding enc) + const UChar* end, UChar* fold, OnigEncoding enc) { const UChar* p = *pp; @@ -367,10 +371,10 @@ mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { if (*p == 0x49) { - *fold++ = 0xc4; - *fold = 0xb1; - (*pp)++; - return 2; + *fold++ = 0xc4; + *fold = 0xb1; + (*pp)++; + return 2; } } #endif @@ -387,7 +391,7 @@ mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, static int get_ctype_code_range(OnigCtype ctype, OnigCodePoint *sb_out, - const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED) + const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED) { *sb_out = 0x80; return onigenc_unicode_ctype_code_range(ctype, ranges); diff --git a/onigmo/regenc.h b/onigmo/regenc.h index 95f235dc4..b4f57732d 100644 --- a/onigmo/regenc.h +++ b/onigmo/regenc.h @@ -133,6 +133,7 @@ typedef struct { #define USE_UNICODE_PROPERTIES #define USE_UNICODE_AGE_PROPERTIES /* #define USE_UNICODE_CASE_FOLD_TURKISH_AZERI */ +#define USE_ASCII_ALL_LINE_BREAKS // LF, VT, FF, CR /* #define USE_UNICODE_ALL_LINE_TERMINATORS */ /* see Unicode.org UTS #18 */ @@ -187,8 +188,8 @@ ONIG_EXTERN int onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, OnigA #define UTF16_IS_SURROGATE_SECOND(c) (((c) & 0xfc) == 0xdc) #define UTF16_IS_SURROGATE(c) (((c) & 0xf8) == 0xd8) #define UNICODE_VALID_CODEPOINT_P(c) ( \ - ((c) <= 0x10ffff) && \ - !((c) < 0x10000 && 
UTF16_IS_SURROGATE((c) >> 8))) + ((c) <= 0x10ffff) && \ + !((c) < 0x10000 && UTF16_IS_SURROGATE((c) >> 8))) #define ONIGENC_ISO_8859_1_TO_LOWER_CASE(c) \ OnigEncISO_8859_1_ToLowerCaseTable[c] @@ -234,8 +235,8 @@ extern int ONIG_ENC_REGISTER(const char *, OnigEncoding); # define OnigEncodingDefine(f,n) \ OnigEncodingDeclare(n); \ void Init_##f(void) { \ - ONIG_ENC_REGISTER(OnigEncodingName(n).name, \ - &OnigEncodingName(n)); \ + ONIG_ENC_REGISTER(OnigEncodingName(n).name, \ + &OnigEncodingName(n)); \ } \ OnigEncodingDeclare(n) #else diff --git a/scionigmo/OnigmoRegExEngine.cxx b/scionigmo/OnigmoRegExEngine.cxx index e8f22b9d3..ab24b8e0e 100644 --- a/scionigmo/OnigmoRegExEngine.cxx +++ b/scionigmo/OnigmoRegExEngine.cxx @@ -205,7 +205,7 @@ static void replaceAll(std::string& source, const std::string& from, const std:: * Has not been tested with backwards DBCS searches yet. */ long OnigmoRegExEngine::FindText(Document* doc, Sci::Position minPos, Sci::Position maxPos, const char *pattern, - bool caseSensitive, bool word, bool wordStart, int searchFlags, Sci::Position *length) + bool caseSensitive, bool word, bool wordStart, int searchFlags, Sci::Position *length) { if (!(pattern && (strlen(pattern) > 0))) { *length = 0; @@ -214,15 +214,18 @@ long OnigmoRegExEngine::FindText(Document* doc, Sci::Position minPos, Sci::Posit Sci::Position docLen = SciPos(doc->Length()); + const bool findForward = (minPos <= maxPos); + const int increment = findForward ? 1 : -1; + // Range endpoints should not be inside DBCS characters, but just in case, move them. - minPos = doc->MovePositionOutsideChar(minPos, 1, false); - maxPos = doc->MovePositionOutsideChar(maxPos, 1, false); - const bool findprevious = (minPos > maxPos); - Sci::Position rangeBeg = (findprevious) ? maxPos : minPos; - Sci::Position rangeEnd = (findprevious) ? 
minPos : maxPos; + minPos = doc->MovePositionOutsideChar(minPos, increment, false); + maxPos = doc->MovePositionOutsideChar(maxPos, increment, false); + + Sci::Position rangeBeg = (findForward) ? minPos : maxPos; + Sci::Position rangeEnd = (findForward) ? maxPos : minPos; Sci::Position rangeLen = (rangeEnd - rangeBeg); - + // ----------------------------- // --- Onigmo Engine Options --- // ----------------------------- @@ -241,7 +244,7 @@ long OnigmoRegExEngine::FindText(Document* doc, Sci::Position minPos, Sci::Posit else { ONIG_OPTION_OFF(onigmoOptions, ONIG_OPTION_DOTALL); } - + //ONIG_OPTION_ON(onigmoOptions, ONIG_OPTION_SINGLELINE); ONIG_OPTION_ON(onigmoOptions, ONIG_OPTION_NEGATE_SINGLELINE); @@ -257,8 +260,7 @@ long OnigmoRegExEngine::FindText(Document* doc, Sci::Position minPos, Sci::Posit bool bReCompile = (m_RegExpr == nullptr) || (m_CmplOptions != onigmoOptions) || (m_RegExprStrg.compare(sRegExprStrg) != 0); - if (bReCompile) - { + if (bReCompile) { m_RegExprStrg.clear(); m_RegExprStrg = sRegExprStrg; m_CmplOptions = onigmoOptions; @@ -289,12 +291,14 @@ long OnigmoRegExEngine::FindText(Document* doc, Sci::Position minPos, Sci::Posit UChar* docBegPtr = (UChar*)doc->RangePointer(0, docLen); UChar* docSEndPtr = (UChar*)doc->RangePointer(docLen, 0); UChar* rangeBegPtr = (UChar*)doc->RangePointer(rangeBeg, rangeLen); - UChar* rangeEndPtr = (UChar*)doc->RangePointer(rangeEnd, rangeLen); - + UChar* rangeEndPtr = (UChar*)doc->RangePointer(rangeEnd, 0); OnigPosition result = ONIG_MISMATCH; try { - result = onig_search(m_RegExpr, docBegPtr, docSEndPtr, rangeBegPtr, rangeEndPtr, &m_Region, onigmoOptions); + if (findForward) + result = onig_search(m_RegExpr, docBegPtr, docSEndPtr, rangeBegPtr, rangeEndPtr, &m_Region, onigmoOptions); + else // X // + result = onig_search(m_RegExpr, docBegPtr, docSEndPtr, rangeEndPtr, rangeBegPtr, &m_Region, onigmoOptions); } catch (...) 
{ return Cast2long(-3); // -1 is normally used for not found, -3 is used here for exception @@ -305,26 +309,7 @@ long OnigmoRegExEngine::FindText(Document* doc, Sci::Position minPos, Sci::Posit return Cast2long(-3); } - if (findprevious) // search for last occurrence in range - { - //SPEEDUP: onig_scan() ??? - - while ((result >= 0) && (rangeBegPtr <= rangeEndPtr)) - { - m_MatchPos = SciPos(result); //SciPos(m_Region.beg[0]); - m_MatchLen = SciPos(m_Region.end[0] - result); - - rangeBegPtr = docBegPtr + (m_MatchPos + max(1,m_MatchLen)); - - try { - result = onig_search(m_RegExpr, docBegPtr, docSEndPtr, rangeBegPtr, rangeEndPtr, &m_Region, onigmoOptions); - } - catch (...) { - return Cast2long(-3); - } - } - } - else if ((result >= 0) && (rangeBegPtr <= rangeEndPtr)) + if ((result >= 0) && (rangeBegPtr <= rangeEndPtr)) { m_MatchPos = SciPos(result); //SciPos(m_Region.beg[0]); m_MatchLen = SciPos(m_Region.end[0] - result); From d84a00357255ba5d2f420055bdfa12f97add0560 Mon Sep 17 00:00:00 2001 From: Rainer Kottenhoff Date: Fri, 2 Mar 2018 00:33:03 +0100 Subject: [PATCH 4/6] + refactoring: prepare for enhanced encoding detection --- src/Dialogs.c | 2 + src/Edit.c | 83 +- src/Encoding.c | 1577 ++++++++++++++++++++++++++++++++++ src/Encoding.h | 139 +++ src/Helpers.c | 882 +------------------ src/Helpers.h | 81 +- src/Notepad3.c | 28 +- src/Notepad3.vcxproj | 2 + src/Notepad3.vcxproj.filters | 6 + src/Print.cpp | 6 +- src/Styles.c | 4 +- 11 files changed, 1794 insertions(+), 1016 deletions(-) create mode 100644 src/Encoding.c create mode 100644 src/Encoding.h diff --git a/src/Dialogs.c b/src/Dialogs.c index 8b7104114..2baba9a28 100644 --- a/src/Dialogs.c +++ b/src/Dialogs.c @@ -44,6 +44,8 @@ #include "resource.h" #include "version.h" #include "helpers.h" +#include "encoding.h" + #include "dialogs.h" diff --git a/src/Edit.c b/src/Edit.c index 2f8bb26ae..b6f456b79 100644 --- a/src/Edit.c +++ b/src/Edit.c @@ -32,6 +32,7 @@ #include #include #include + #include "scintilla.h" 
#include "scilexer.h" #include "notepad3.h" @@ -42,9 +43,13 @@ #include "../uthash/utarray.h" //#include "../uthash/utstring.h" #include "helpers.h" -#include "edit.h" +#include "encoding.h" + #include "SciCall.h" +#include "edit.h" + + #ifndef LCMAP_TITLECASE #define LCMAP_TITLECASE 0x00000300 // Title Case Letters bit mask #endif @@ -103,9 +108,6 @@ extern int g_iTabWidth; extern int g_iIndentWidth; -extern NP2ENCODING g_Encodings[]; - - #define DELIM_BUFFER 258 static char DelimChars[DELIM_BUFFER] = { '\0' }; static char DelimCharsAccel[DELIM_BUFFER] = { '\0' }; @@ -411,9 +413,9 @@ BOOL EditConvertText(HWND hwnd, int encSource, int encDest, BOOL bSetSavePoint) WCHAR* pwchText = GlobalAlloc(GPTR,wchBufSize); // MultiBytes(Sci) -> WideChar(destination) -> Sci(MultiByte) - //UINT cpSci = g_Encodings[encSource].uCodePage; + //UINT cpSci = Encoding_GetCodePage(encSource); UINT cpSci = Encoding_SciGetCodePage(hwnd); // fixed Scintilla internal (UTF-8) - UINT cpDst = g_Encodings[encDest].uCodePage; + UINT cpDst = Encoding_GetCodePage(encDest); // get text as wide char int cbwText = MultiByteToWideChar(cpSci,0, pchText, (int)length, pwchText, (int)wchBufSize); @@ -495,13 +497,13 @@ BOOL EditIsRecodingNeeded(WCHAR* pszText, int cchLen) if ((pszText == NULL) || (cchLen < 1)) return FALSE; - UINT codepage = g_Encodings[Encoding_Current(CPI_GET)].uCodePage; + UINT codepage = Encoding_GetCodePage(Encoding_Current(CPI_GET)); if ((codepage == CP_UTF7) || (codepage == CP_UTF8)) return FALSE; DWORD dwFlags = WC_NO_BEST_FIT_CHARS | WC_COMPOSITECHECK | WC_DEFAULTCHAR; - BOOL useNullParams = (g_Encodings[Encoding_Current(CPI_GET)].uFlags & NCP_MBCS) ? TRUE : FALSE; + BOOL useNullParams = Encoding_IsMBCS(Encoding_Current(CPI_GET)) ? 
TRUE : FALSE; BOOL bDefaultCharsUsed = FALSE; int cch = 0; @@ -1061,10 +1063,10 @@ BOOL EditLoadFile( bPreferOEM = TRUE; } - const int iFileEncoding = Encoding_SrcCmdLn(CPI_GET); + const int iForcedEncoding = Encoding_SrcCmdLn(CPI_GET); const int iFileEncWeak = (Encoding_SrcWeak(CPI_GET) != CPI_NONE) ? Encoding_SrcWeak(CPI_GET) : CPI_ANSI_DEFAULT; const int iPreferedEncoding = (bPreferOEM) ? g_DOSEncoding : (bUseDefaultForFileEncoding ? g_iDefaultNewFileEncoding : iFileEncWeak); - //@@@(g_Encodings[iFileEncWeak].uFlags & NCP_INTERNAL) ? g_iDefaultNewFileEncoding : iFileEncWeak; + //@@@ Encoding_IsINTERNAL(iFileEncWeak) ? g_iDefaultNewFileEncoding : iFileEncWeak; BOOL bBOM = FALSE; BOOL bReverse = FALSE; @@ -1072,36 +1074,35 @@ BOOL EditLoadFile( if (cbData == 0) { FileVars_Init(NULL,0,&fvCurFile); *iEOLMode = iLineEndings[g_iDefaultEOLMode]; - if (iFileEncoding == CPI_NONE) { + if (iForcedEncoding == CPI_NONE) { if (bLoadASCIIasUTF8 && !bPreferOEM) *iEncoding = CPI_UTF8; else *iEncoding = iPreferedEncoding; } else - *iEncoding = iFileEncoding; + *iEncoding = iForcedEncoding; Encoding_SciSetCodePage(hwnd,*iEncoding); EditSetNewText(hwnd,"",0); SendMessage(hwnd,SCI_SETEOLMODE,iLineEndings[g_iDefaultEOLMode],0); GlobalFree(lpData); } - else if (!bSkipEncodingDetection && - (iFileEncoding == CPI_NONE || iFileEncoding == CPI_UNICODE || iFileEncoding == CPI_UNICODEBE) && - (iFileEncoding == CPI_UNICODE || iFileEncoding == CPI_UNICODEBE || IsUnicode(lpData,cbData,&bBOM,&bReverse)) && - (iFileEncoding == CPI_UNICODE || iFileEncoding == CPI_UNICODEBE || !IsUTF8Signature(lpData))) // check for UTF-8 signature + (iForcedEncoding == CPI_NONE || iForcedEncoding == CPI_UNICODE || iForcedEncoding == CPI_UNICODEBE) && + (iForcedEncoding == CPI_UNICODE || iForcedEncoding == CPI_UNICODEBE || IsUnicode(lpData,cbData,&bBOM,&bReverse)) && + (iForcedEncoding == CPI_UNICODE || iForcedEncoding == CPI_UNICODEBE || !IsUTF8Signature(lpData))) // check for UTF-8 signature { char* lpDataUTF8; 
- if (iFileEncoding == CPI_UNICODE) { + if (iForcedEncoding == CPI_UNICODE) { bBOM = (*((UNALIGNED PWCHAR)lpData) == 0xFEFF); bReverse = FALSE; } - else if (iFileEncoding == CPI_UNICODEBE) + else if (iForcedEncoding == CPI_UNICODEBE) bBOM = (*((UNALIGNED PWCHAR)lpData) == 0xFFFE); - if (iFileEncoding == CPI_UNICODEBE || bReverse) { + if (iForcedEncoding == CPI_UNICODEBE || bReverse) { _swab(lpData,lpData,cbData); if (bBOM) *iEncoding = CPI_UNICODEBEBOM; @@ -1148,18 +1149,17 @@ BOOL EditLoadFile( else { FileVars_Init(lpData,cbData,&fvCurFile); - if (!bSkipEncodingDetection && (iFileEncoding == CPI_NONE || iFileEncoding == CPI_UTF8 || iFileEncoding == CPI_UTF8SIGN) && + if (!bSkipEncodingDetection && (iForcedEncoding == CPI_NONE || iForcedEncoding == CPI_UTF8 || iForcedEncoding == CPI_UTF8SIGN) && ((IsUTF8Signature(lpData) || FileVars_IsUTF8(&fvCurFile) || - (iFileEncoding == CPI_UTF8 || iFileEncoding == CPI_UTF8SIGN) || + (iForcedEncoding == CPI_UTF8 || iForcedEncoding == CPI_UTF8SIGN) || (!bPreferOEM && bLoadASCIIasUTF8) || // from menu "Reload As UTF-8" (IsUTF8(lpData,cbData) && (((UTF8_mbslen_bytes(UTF8StringStart(lpData)) - 1 != UTF8_mbslen(UTF8StringStart(lpData),IsUTF8Signature(lpData) ? 
cbData-3 : cbData)) || (!bPreferOEM && ( - g_Encodings[iPreferedEncoding].uFlags & NCP_UTF8 || - bLoadASCIIasUTF8))))))) && !(FileVars_IsNonUTF8(&fvCurFile) && - (iFileEncoding != CPI_UTF8 && iFileEncoding != CPI_UTF8SIGN))) + Encoding_IsUTF8(iPreferedEncoding) || bLoadASCIIasUTF8))))))) && !(FileVars_IsNonUTF8(&fvCurFile) && + (iForcedEncoding != CPI_UTF8 && iForcedEncoding != CPI_UTF8SIGN))) { Encoding_SciSetCodePage(hwnd,CPI_UTF8); EditSetNewText(hwnd,"",0); @@ -1178,8 +1178,8 @@ BOOL EditLoadFile( else { - if (iFileEncoding != CPI_NONE) - *iEncoding = iFileEncoding; + if (iForcedEncoding != CPI_NONE) + *iEncoding = iForcedEncoding; else { *iEncoding = FileVars_GetEncoding(&fvCurFile); if (*iEncoding == CPI_NONE) { @@ -1191,10 +1191,10 @@ BOOL EditLoadFile( } } - if (((g_Encodings[*iEncoding].uCodePage != CP_UTF7) && (g_Encodings[*iEncoding].uFlags & NCP_EXTERNAL_8BIT)) || - ((g_Encodings[*iEncoding].uCodePage == CP_UTF7) && IsUTF7(lpData,cbData))) { + if (((Encoding_GetCodePage(*iEncoding) != CP_UTF7) && Encoding_IsEXTERNAL_8BIT(*iEncoding)) || + ((Encoding_GetCodePage(*iEncoding) == CP_UTF7) && IsUTF7(lpData,cbData))) { - UINT uCodePage = g_Encodings[*iEncoding].uCodePage; + UINT uCodePage = Encoding_GetCodePage(*iEncoding); LPWSTR lpDataWide = GlobalAlloc(GPTR,cbData * 2 + 16); int cbDataWide = MultiByteToWideChar(uCodePage,0,lpData,cbData,lpDataWide,(int)GlobalSize(lpDataWide)/sizeof(WCHAR)); @@ -1229,7 +1229,7 @@ BOOL EditLoadFile( } } else { - *iEncoding = Encoding_IsValid(iFileEncoding) ? iFileEncoding : iPreferedEncoding; + *iEncoding = Encoding_IsValid(iForcedEncoding) ? 
iForcedEncoding : iPreferedEncoding; Encoding_SciSetCodePage(hwnd,*iEncoding); EditSetNewText(hwnd,"",0); EditSetNewText(hwnd,lpData,cbData); @@ -1343,19 +1343,19 @@ BOOL EditSaveFile( } }*/ - if (g_Encodings[iEncoding].uFlags & NCP_UNICODE) + if (Encoding_IsUNICODE(iEncoding)) { SetEndOfFile(hFile); LPWSTR lpDataWide = GlobalAlloc(GPTR, cbData * 2 + 16); int bomoffset = 0; - if (g_Encodings[iEncoding].uFlags & NCP_UNICODE_BOM) { + if (Encoding_IsUNICODE_BOM(iEncoding)) { const char* bom = "\xFF\xFE"; CopyMemory((char*)lpDataWide, bom, 2); bomoffset = 1; } int cbDataWide = bomoffset + MultiByteToWideChar(Encoding_SciGetCodePage(hwnd), 0, lpData, cbData, &lpDataWide[bomoffset], (int)GlobalSize(lpDataWide) / sizeof(WCHAR) - bomoffset); - if (g_Encodings[iEncoding].uFlags & NCP_UNICODE_REVERSE) { + if (Encoding_IsUNICODE_REVERSE(iEncoding)) { _swab((char*)lpDataWide, (char*)lpDataWide, cbDataWide * sizeof(WCHAR)); } bWriteSuccess = EncryptAndWriteFile(hwnd, hFile, (BYTE*)lpDataWide, cbDataWide * sizeof(WCHAR), &dwBytesWritten); @@ -1365,11 +1365,11 @@ BOOL EditSaveFile( GlobalFree(lpData); } - else if (g_Encodings[iEncoding].uFlags & NCP_UTF8) + else if (Encoding_IsUTF8(iEncoding)) { SetEndOfFile(hFile); - if (g_Encodings[iEncoding].uFlags & NCP_UTF8_SIGN) { + if (Encoding_IsUTF8_SIGN(iEncoding)) { const char* bom = "\xEF\xBB\xBF"; DWORD bomoffset = 3; MoveMemory(&lpData[bomoffset], lpData, cbData); @@ -1383,15 +1383,15 @@ BOOL EditSaveFile( GlobalFree(lpData); } - else if (g_Encodings[iEncoding].uFlags & NCP_EXTERNAL_8BIT) { + else if (Encoding_IsEXTERNAL_8BIT(iEncoding)) { BOOL bCancelDataLoss = FALSE; - UINT uCodePage = g_Encodings[iEncoding].uCodePage; + UINT uCodePage = Encoding_GetCodePage(iEncoding); LPWSTR lpDataWide = GlobalAlloc(GPTR,cbData * 2 + 16); int cbDataWide = MultiByteToWideChar(Encoding_SciGetCodePage(hwnd),0,lpData,cbData,lpDataWide,(int)GlobalSize(lpDataWide)/sizeof(WCHAR)); - if (g_Encodings[iEncoding].uFlags & NCP_MBCS) { + if 
(Encoding_IsMBCS(iEncoding)) { GlobalFree(lpData); lpData = GlobalAlloc(GPTR, GlobalSize(lpDataWide) * 2); // need more space cbData = WideCharToMultiByte(uCodePage, 0, lpDataWide, cbDataWide, lpData, (int)GlobalSize(lpData), NULL, NULL); @@ -7361,16 +7361,15 @@ BOOL FileVars_IsValidEncoding(LPFILEVARS lpfv) { if (lpfv->mask & FV_ENCODING && lpfv->iEncoding >= 0 && lpfv->iEncoding < Encoding_CountOf()) { - if ((g_Encodings[lpfv->iEncoding].uFlags & NCP_INTERNAL) || - IsValidCodePage(g_Encodings[lpfv->iEncoding].uCodePage) && - GetCPInfo(g_Encodings[lpfv->iEncoding].uCodePage,&cpi)) { + if ((Encoding_IsINTERNAL(lpfv->iEncoding)) || + IsValidCodePage(Encoding_GetCodePage(lpfv->iEncoding)) && + GetCPInfo(Encoding_GetCodePage(lpfv->iEncoding),&cpi)) { return(TRUE); } } return(FALSE); } - //============================================================================= // // FileVars_GetEncoding() diff --git a/src/Encoding.c b/src/Encoding.c new file mode 100644 index 000000000..2c91e51e3 --- /dev/null +++ b/src/Encoding.c @@ -0,0 +1,1577 @@ +/****************************************************************************** +* * +* * +* Notepad3 * +* * +* Encoding.c * +* General helper functions * +* Based on code from Notepad2, (c) Florian Balmer 1996-2011 * +* Parts taken from SciTE, (c) Neil Hodgson * +* MinimizeToTray, (c) 2000 Matthew Ellis * +* * +* (c) Rizonesoft 2015-2018 * +* https://rizonesoft.com * +* * +* * +*******************************************************************************/ + +#if !defined(WINVER) +#define WINVER 0x601 /*_WIN32_WINNT_WIN7*/ +#endif +#if !defined(_WIN32_WINNT) +#define _WIN32_WINNT 0x601 /*_WIN32_WINNT_WIN7*/ +#endif +#if !defined(NTDDI_VERSION) +#define NTDDI_VERSION 0x06010000 /*NTDDI_WIN7*/ +#endif +#define VC_EXTRALEAN 1 + +#include +#include +#include + +//#include "../uthash/utarray.h" + +#include "scintilla.h" +#include "helpers.h" +#include "encoding.h" + + +extern HINSTANCE g_hInstance; + + 
+//============================================================================= +// +// Encoding Helper Functions +// + +int g_DOSEncoding; + +// Supported Encodings +WCHAR wchANSI[16] = { L'\0' }; +WCHAR wchOEM[16] = { L'\0' }; + +static NP2ENCODING g_Encodings[] = { +{ NCP_ANSI | NCP_RECODE, CP_ACP, "ansi,system,ascii,", 61000, L"" }, +{ NCP_OEM | NCP_RECODE, CP_OEMCP, "oem,oem,", 61001, L"" }, +{ NCP_UNICODE | NCP_UNICODE_BOM, CP_UTF8, "", 61002, L"" }, +{ NCP_UNICODE | NCP_UNICODE_REVERSE | NCP_UNICODE_BOM, CP_UTF8, "", 61003, L"" }, +{ NCP_UNICODE | NCP_RECODE, CP_UTF8, "utf-16,utf16,unicode,", 61004, L"" }, +{ NCP_UNICODE | NCP_UNICODE_REVERSE | NCP_RECODE, CP_UTF8, "utf-16be,utf16be,unicodebe,", 61005, L"" }, +{ NCP_UTF8 | NCP_RECODE, CP_UTF8, "utf-8,utf8,", 61006, L"" }, +{ NCP_UTF8 | NCP_UTF8_SIGN, CP_UTF8, "utf-8,utf8,", 61007, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, CP_UTF7, "utf-7,utf7,", 61008, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 720, "DOS-720,dos720,", 61009, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28596, "iso-8859-6,iso88596,arabic,csisolatinarabic,ecma114,isoir127,", 61010, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10004, "x-mac-arabic,xmacarabic,", 61011, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1256, "windows-1256,windows1256,cp1256", 61012, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 775, "ibm775,ibm775,cp500,", 61013, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28594, "iso-8859-4,iso88594,csisolatin4,isoir110,l4,latin4,", 61014, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1257, "windows-1257,windows1257,", 61015, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 852, "ibm852,ibm852,cp852,", 61016, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28592, "iso-8859-2,iso88592,csisolatin2,isoir101,latin2,l2,", 61017, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10029, "x-mac-ce,xmacce,", 61018, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1250, "windows-1250,windows1250,xcp1250,", 61019, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 936, 
"gb2312,gb2312,chinese,cngb,csgb2312,csgb231280,gb231280,gbk,", 61020, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10008, "x-mac-chinesesimp,xmacchinesesimp,", 61021, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 950, "big5,big5,cnbig5,csbig5,xxbig5,", 61022, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10002, "x-mac-chinesetrad,xmacchinesetrad,", 61023, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10082, "x-mac-croatian,xmaccroatian,", 61024, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 866, "cp866,cp866,ibm866,", 61025, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28595, "iso-8859-5,iso88595,csisolatin5,csisolatincyrillic,cyrillic,isoir144,", 61026, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 20866, "koi8-r,koi8r,cskoi8r,koi,koi8,", 61027, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 21866, "koi8-u,koi8u,koi8ru,", 61028, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10007, "x-mac-cyrillic,xmaccyrillic,", 61029, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1251, "windows-1251,windows1251,xcp1251,", 61030, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28603, "iso-8859-13,iso885913,", 61031, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 863, "ibm863,ibm863,", 61032, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 737, "ibm737,ibm737,", 61033, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28597, "iso-8859-7,iso88597,csisolatingreek,ecma118,elot928,greek,greek8,isoir126,", 61034, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10006, "x-mac-greek,xmacgreek,", 61035, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1253, "windows-1253,windows1253,", 61036, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 869, "ibm869,ibm869,", 61037, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 862, "DOS-862,dos862,", 61038, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 38598, "iso-8859-8-i,iso88598i,logical,", 61039, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28598, "iso-8859-8,iso88598,csisolatinhebrew,hebrew,isoir138,visual,", 61040, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10005, "x-mac-hebrew,xmachebrew,", 61041, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 
1255, "windows-1255,windows1255,", 61042, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 861, "ibm861,ibm861,", 61043, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10079, "x-mac-icelandic,xmacicelandic,", 61044, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10001, "x-mac-japanese,xmacjapanese,", 61045, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 932, "shift_jis,shiftjis,shiftjs,csshiftjis,cswindows31j,mskanji,xmscp932,xsjis,", 61046, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10003, "x-mac-korean,xmackorean,", 61047, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 949, "windows-949,windows949,ksc56011987,csksc5601,euckr,isoir149,korean,ksc56011989", 61048, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28593, "iso-8859-3,iso88593,latin3,isoir109,l3,", 61049, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28605, "iso-8859-15,iso885915,latin9,l9,", 61050, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 865, "ibm865,ibm865,", 61051, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 437, "ibm437,ibm437,437,cp437,cspc8,codepage437,", 61052, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 858, "ibm858,ibm858,ibm00858,", 61053, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 860, "ibm860,ibm860,", 61054, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10010, "x-mac-romanian,xmacromanian,", 61055, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10021, "x-mac-thai,xmacthai,", 61056, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 874, "windows-874,windows874,dos874,iso885911,tis620,", 61057, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 857, "ibm857,ibm857,", 61058, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28599, "iso-8859-9,iso88599,latin5,isoir148,l5,", 61059, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10081, "x-mac-turkish,xmacturkish,", 61060, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1254, "windows-1254,windows1254,", 61061, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10017, "x-mac-ukrainian,xmacukrainian,", 61062, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1258, "windows-1258,windows-258,", 61063, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 850, 
"ibm850,ibm850,", 61064, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28591, "iso-8859-1,iso88591,cp819,latin1,ibm819,isoir100,latin1,l1,", 61065, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10000, "macintosh,macintosh,", 61066, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1252, "windows-1252,windows1252,cp367,cp819,ibm367,us,xansi,", 61067, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 37, "ebcdic-cp-us,ebcdiccpus,ebcdiccpca,ebcdiccpwt,ebcdiccpnl,ibm037,cp037,", 61068, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 500, "x-ebcdic-international,xebcdicinternational,", 61069, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 875, "x-EBCDIC-GreekModern,xebcdicgreekmodern,", 61070, L"" }, +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1026, "CP1026,cp1026,csibm1026,ibm1026,", 61071, L"" }, +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 870, "CP870,cp870,ebcdiccproece,ebcdiccpyu,csibm870,ibm870,", 00000, L"" }, // IBM EBCDIC (Multilingual Latin-2) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1047, "IBM01047,ibm01047,", 00000, L"" }, // IBM EBCDIC (Open System Latin-1) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1140, "x-ebcdic-cp-us-euro,xebcdiccpuseuro,", 00000, L"" }, // IBM EBCDIC (US-Canada-Euro) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1141, "x-ebcdic-germany-euro,xebcdicgermanyeuro,", 00000, L"" }, // IBM EBCDIC (Germany-Euro) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1142, "x-ebcdic-denmarknorway-euro,xebcdicdenmarknorwayeuro,", 00000, L"" }, // IBM EBCDIC (Denmark-Norway-Euro) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1143, "x-ebcdic-finlandsweden-euro,xebcdicfinlandswedeneuro,", 00000, L"" }, // IBM EBCDIC (Finland-Sweden-Euro) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1144, "x-ebcdic-italy-euro,xebcdicitalyeuro,", 00000, L"" }, // IBM EBCDIC (Italy-Euro) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1145, "x-ebcdic-spain-euro,xebcdicspaineuro,", 00000, L"" }, // IBM EBCDIC (Spain-Latin America-Euro) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1146, "x-ebcdic-uk-euro,xebcdicukeuro,", 00000, L"" }, // IBM EBCDIC (UK-Euro) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1147, 
"x-ebcdic-france-euro,xebcdicfranceeuro,", 00000, L"" }, // IBM EBCDIC (France-Euro) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1148, "x-ebcdic-international-euro,xebcdicinternationaleuro,", 00000, L"" }, // IBM EBCDIC (International-Euro) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1149, "x-ebcdic-icelandic-euro,xebcdicicelandiceuro,", 00000, L"" }, // IBM EBCDIC (Icelandic-Euro) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1361, "johab,johab,", 00000, L"" }, // Korean (Johab) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20273, "x-EBCDIC-Germany,xebcdicgermany,", 00000, L"" }, // IBM EBCDIC (Germany) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20277, "x-EBCDIC-DenmarkNorway,xebcdicdenmarknorway,ebcdiccpdk,ebcdiccpno,", 00000, L"" }, // IBM EBCDIC (Denmark-Norway) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20278, "x-EBCDIC-FinlandSweden,xebcdicfinlandsweden,ebcdicpfi,ebcdiccpse,", 00000, L"" }, // IBM EBCDIC (Finland-Sweden) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20280, "x-EBCDIC-Italy,xebcdicitaly,", 00000, L"" }, // IBM EBCDIC (Italy) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20284, "x-EBCDIC-Spain,xebcdicspain,ebcdiccpes,", 00000, L"" }, // IBM EBCDIC (Spain-Latin America) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20285, "x-EBCDIC-UK,xebcdicuk,ebcdiccpgb,", 00000, L"" }, // IBM EBCDIC (UK) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20290, "x-EBCDIC-JapaneseKatakana,xebcdicjapanesekatakana,", 00000, L"" }, // IBM EBCDIC (Japanese Katakana) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20297, "x-EBCDIC-France,xebcdicfrance,ebcdiccpfr,", 00000, L"" }, // IBM EBCDIC (France) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20420, "x-EBCDIC-Arabic,xebcdicarabic,ebcdiccpar1,", 00000, L"" }, // IBM EBCDIC (Arabic) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20423, "x-EBCDIC-Greek,xebcdicgreek,ebcdiccpgr,", 00000, L"" }, // IBM EBCDIC (Greek) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20424, "x-EBCDIC-Hebrew,xebcdichebrew,ebcdiccphe,", 00000, L"" }, // IBM EBCDIC (Hebrew) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20833, "x-EBCDIC-KoreanExtended,xebcdickoreanextended,", 00000, L"" }, // IBM EBCDIC 
(Korean Extended) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20838, "x-EBCDIC-Thai,xebcdicthai,ibmthai,csibmthai,", 00000, L"" }, // IBM EBCDIC (Thai) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20871, "x-EBCDIC-Icelandic,xebcdicicelandic,ebcdiccpis,", 00000, L"" }, // IBM EBCDIC (Icelandic) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20880, "x-EBCDIC-CyrillicRussian,xebcdiccyrillicrussian,ebcdiccyrillic,", 00000, L"" }, // IBM EBCDIC (Cyrillic Russian) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20905, "x-EBCDIC-Turkish,xebcdicturkish,ebcdiccptr,", 00000, L"" }, // IBM EBCDIC (Turkish) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20924, "IBM00924,ibm00924,ebcdiclatin9euro,", 00000, L"" }, // IBM EBCDIC (Open System-Euro Latin-1) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 21025, "x-EBCDIC-CyrillicSerbianBulgarian,xebcdiccyrillicserbianbulgarian,", 00000, L"" }, // IBM EBCDIC (Cyrillic Serbian-Bulgarian) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50930, "x-EBCDIC-JapaneseAndKana,xebcdicjapaneseandkana,", 00000, L"" }, // IBM EBCDIC (Japanese and Japanese Katakana) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50931, "x-EBCDIC-JapaneseAndUSCanada,xebcdicjapaneseanduscanada,", 00000, L"" }, // IBM EBCDIC (Japanese and US-Canada) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50933, "x-EBCDIC-KoreanAndKoreanExtended,xebcdickoreanandkoreanextended,", 00000, L"" }, // IBM EBCDIC (Korean and Korean Extended) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50935, "x-EBCDIC-SimplifiedChinese,xebcdicsimplifiedchinese,", 00000, L"" }, // IBM EBCDIC (Chinese Simplified) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50937, "x-EBCDIC-TraditionalChinese,xebcdictraditionalchinese,", 00000, L"" }, // IBM EBCDIC (Chinese Traditional) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50939, "x-EBCDIC-JapaneseAndJapaneseLatin,xebcdicjapaneseandjapaneselatin,", 00000, L"" }, // IBM EBCDIC (Japanese and Japanese-Latin) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20105, "x-IA5,xia5,", 00000, L"" }, // Western European (IA5) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20106, "x-IA5-German,xia5german,", 00000, L"" }, // German 
(IA5) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20107, "x-IA5-Swedish,xia5swedish,", 00000, L"" }, // Swedish (IA5) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20108, "x-IA5-Norwegian,xia5norwegian,", 00000, L"" }, // Norwegian (IA5) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20936, "x-cp20936,xcp20936,", 00000, L"" }, // Chinese Simplified (GB2312) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20932, "euc-jp,,", 00000, L"" }, // Japanese (JIS X 0208-1990 & 0212-1990) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50220, "iso-2022-jp,iso2022jp,", 00000, L"" }, // Japanese (JIS) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50221, "csISO2022JP,csiso2022jp,", 00000, L"" }, // Japanese (JIS-Allow 1 byte Kana) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50222, "_iso-2022-jp$SIO,iso2022jpSIO,", 00000, L"" }, // Japanese (JIS-Allow 1 byte Kana - SO/SI) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50225, "iso-2022-kr,iso2022kr,csiso2022kr,", 00000, L"" }, // Korean (ISO-2022-KR) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50227, "x-cp50227,xcp50227,", 00000, L"" }, // Chinese Simplified (ISO-2022) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50229, "iso-2022-cn,iso2022cn,", 00000, L"" }, // Chinese Traditional (ISO-2022) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20000, "x-Chinese-CNS,xchinesecns,", 00000, L"" }, // Chinese Traditional (CNS) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20002, "x-Chinese-Eten,xchineseeten,", 00000, L"" }, // Chinese Traditional (Eten) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 51932, "euc-jp,eucjp,xeuc,xeucjp,", 00000, L"" }, // Japanese (EUC) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 51936, "euc-cn,euccn,xeuccn,", 00000, L"" }, // Chinese Simplified (EUC) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 51949, "euc-kr,euckr,cseuckr,", 00000, L"" }, // Korean (EUC) +//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 52936, "hz-gb-2312,hzgb2312,hz,", 00000, L"" }, // Chinese Simplified (HZ-GB2312) +{ NCP_EXTERNAL_8BIT | NCP_RECODE, 54936, "gb18030,gb18030,", 61072, L"" } // Chinese Simplified (GB18030) + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57002, "x-iscii-de,xisciide,", 00000, L"" }, // ISCII 
Devanagari + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57003, "x-iscii-be,xisciibe,", 00000, L"" }, // ISCII Bengali + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57004, "x-iscii-ta,xisciita,", 00000, L"" }, // ISCII Tamil + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57005, "x-iscii-te,xisciite,", 00000, L"" }, // ISCII Telugu + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57006, "x-iscii-as,xisciias,", 00000, L"" }, // ISCII Assamese + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57007, "x-iscii-or,xisciior,", 00000, L"" }, // ISCII Oriya + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57008, "x-iscii-ka,xisciika,", 00000, L"" }, // ISCII Kannada + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57009, "x-iscii-ma,xisciima,", 00000, L"" }, // ISCII Malayalam + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57010, "x-iscii-gu,xisciigu,", 00000, L"" }, // ISCII Gujarathi + //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57011, "x-iscii-pa,xisciipa,", 00000, L"" }, // ISCII Panjabi +}; + +int Encoding_CountOf() { + return COUNTOF(g_Encodings); +} + +int Encoding_Current(int iEncoding) { + static int CurrentEncoding = CPI_NONE; + + if (iEncoding >= 0) { + if (Encoding_IsValid(iEncoding)) + CurrentEncoding = iEncoding; + else + CurrentEncoding = CPI_UTF8; + } + return CurrentEncoding; +} + + +int Encoding_SrcCmdLn(int iSrcEncoding) { + static int SourceEncoding = CPI_NONE; + + if (iSrcEncoding >= 0) { + if (Encoding_IsValid(iSrcEncoding)) + SourceEncoding = iSrcEncoding; + else + SourceEncoding = CPI_UTF8; + } + else if (iSrcEncoding == CPI_NONE) { + SourceEncoding = CPI_NONE; + } + return SourceEncoding; +} + + +int Encoding_SrcWeak(int iSrcWeakEnc) { + static int SourceWeakEncoding = CPI_NONE; + + if (iSrcWeakEnc >= 0) { + if (Encoding_IsValid(iSrcWeakEnc)) + SourceWeakEncoding = iSrcWeakEnc; + else + SourceWeakEncoding = CPI_ANSI_DEFAULT; + } + else if (iSrcWeakEnc == CPI_NONE) { + SourceWeakEncoding = CPI_NONE; + } + return SourceWeakEncoding; +} + + +BOOL Encoding_HasChanged(int iOriginalEncoding) { + static int OriginalEncoding = CPI_NONE; + + if 
(iOriginalEncoding >= CPI_NONE) { + OriginalEncoding = iOriginalEncoding; + } + return (BOOL)(OriginalEncoding != Encoding_Current(CPI_GET)); +} +// ============================================================================ + + +void Encoding_InitDefaults() + +{ + const UINT uCodePageMBCS[20] = { + 42, // (Symbol) + 50220,50221,50222,50225,50227,50229, // (Chinese, Japanese, Korean) + 54936, // (GB18030) + 57002,57003,57004,57005,57006,57007,57008,57009,57010,57011, // (ISCII) + 65000, // (UTF-7) + 65001 // (UTF-8) + }; + + g_Encodings[CPI_ANSI_DEFAULT].uCodePage = GetACP(); // set ANSI system CP + StringCchPrintf(wchANSI, COUNTOF(wchANSI), L" (CP-%u)", g_Encodings[CPI_ANSI_DEFAULT].uCodePage); + + for (int i = CPI_UTF7 + 1; i < COUNTOF(g_Encodings); ++i) { + if (Encoding_IsValid(i) && (g_Encodings[i].uCodePage == g_Encodings[CPI_ANSI_DEFAULT].uCodePage)) { + g_Encodings[i].uFlags |= NCP_ANSI; + if (g_Encodings[i].uFlags & NCP_EXTERNAL_8BIT) + g_Encodings[CPI_ANSI_DEFAULT].uFlags |= NCP_EXTERNAL_8BIT; + break; + } + } + + g_Encodings[CPI_OEM].uCodePage = GetOEMCP(); + StringCchPrintf(wchOEM, COUNTOF(wchOEM), L" (CP-%u)", g_Encodings[CPI_OEM].uCodePage); + + for (int i = CPI_UTF7 + 1; i < COUNTOF(g_Encodings); ++i) { + if (Encoding_IsValid(i) && (g_Encodings[i].uCodePage == g_Encodings[CPI_OEM].uCodePage)) { + g_Encodings[i].uFlags |= NCP_OEM; + if (g_Encodings[i].uFlags & NCP_EXTERNAL_8BIT) + g_Encodings[CPI_OEM].uFlags |= NCP_EXTERNAL_8BIT; + break; + } + } + + // multi byte character sets + for (int i = 0; i < COUNTOF(g_Encodings); ++i) { + for (int k = 0; k < COUNTOF(uCodePageMBCS); k++) { + if (g_Encodings[i].uCodePage == uCodePageMBCS[k]) { + g_Encodings[i].uFlags |= NCP_MBCS; + } + } + } + + g_DOSEncoding = CPI_OEM; + // Try to set the DOS encoding to DOS-437 if the default OEMCP is not DOS-437 + if (g_Encodings[g_DOSEncoding].uCodePage != 437) { + for (int i = CPI_UTF7 + 1; i < COUNTOF(g_Encodings); ++i) { + if (Encoding_IsValid(i) && 
(g_Encodings[i].uCodePage == 437)) { + g_DOSEncoding = i; + break; + } + } + } +} +// ============================================================================ + + +int Encoding_MapIniSetting(BOOL bLoad, int iSetting) { + if (bLoad) { + switch (iSetting) { + case -1: return CPI_NONE; + case 0: return CPI_ANSI_DEFAULT; + case 1: return CPI_UNICODEBOM; + case 2: return CPI_UNICODEBEBOM; + case 3: return CPI_UTF8; + case 4: return CPI_UTF8SIGN; + case 5: return CPI_OEM; + case 6: return CPI_UNICODE; + case 7: return CPI_UNICODEBE; + case 8: return CPI_UTF7; + default: { + for (int i = CPI_UTF7 + 1; i < COUNTOF(g_Encodings); i++) { + if ((g_Encodings[i].uCodePage == (UINT)iSetting) && Encoding_IsValid(i)) + return(i); + } + return CPI_ANSI_DEFAULT; + } + } + } + else { + switch (iSetting) { + case CPI_NONE: return -1; + case CPI_ANSI_DEFAULT: return 0; + case CPI_UNICODEBOM: return 1; + case CPI_UNICODEBEBOM: return 2; + case CPI_UTF8: return 3; + case CPI_UTF8SIGN: return 4; + case CPI_OEM: return 5; + case CPI_UNICODE: return 6; + case CPI_UNICODEBE: return 7; + case CPI_UTF7: return 8; + default: { + if (Encoding_IsValid(iSetting)) + return(g_Encodings[iSetting].uCodePage); + else + return CPI_ANSI_DEFAULT; + } + } + } +} +// ============================================================================ + + +int Encoding_MapUnicode(int iUni) { + + if (iUni == CPI_UNICODEBOM) + return CPI_UNICODE; + else if (iUni == CPI_UNICODEBEBOM) + return CPI_UNICODEBE; + else if (iUni == CPI_UTF8SIGN) + return CPI_UTF8; + else + return iUni; +} +// ============================================================================ + + +void Encoding_SetLabel(int iEncoding) { + if (g_Encodings[iEncoding].wchLabel[0] == L'\0') { + WCHAR wch1[128] = { L'\0' }; + WCHAR wch2[128] = { L'\0' }; + GetString(g_Encodings[iEncoding].idsName, wch1, COUNTOF(wch1)); + WCHAR *pwsz = StrChr(wch1, L';'); + if (pwsz) { + pwsz = StrChr(CharNext(pwsz), L';'); + if (pwsz) { + pwsz = CharNext(pwsz); + } + 
} + if (!pwsz) + pwsz = wch1; + + StringCchCopyN(wch2, COUNTOF(wch2), pwsz, COUNTOF(wch1)); + + if (Encoding_IsANSI(iEncoding)) + StringCchCatN(wch2, COUNTOF(wch2), wchANSI, COUNTOF(wchANSI)); + else if (Encoding_IsOEM(iEncoding)) + StringCchCatN(wch2, COUNTOF(wch2), wchOEM, COUNTOF(wchOEM)); + + StringCchCopyN(g_Encodings[iEncoding].wchLabel, COUNTOF(g_Encodings[iEncoding].wchLabel), + wch2, COUNTOF(g_Encodings[iEncoding].wchLabel)); + } +} +// ============================================================================ + + +int Encoding_MatchW(LPCWSTR pwszTest) { + char tchTest[256] = { '\0' }; + WideCharToMultiByteStrg(CP_ACP, pwszTest, tchTest); + return(Encoding_MatchA(tchTest)); +} +// ============================================================================ + + +int Encoding_MatchA(char *pchTest) { + char chTest[256] = { '\0' }; + char *pchSrc = pchTest; + char *pchDst = chTest; + *pchDst++ = ','; + while (*pchSrc) { + if (IsCharAlphaNumericA(*pchSrc)) + *pchDst++ = *CharLowerA(pchSrc); + pchSrc++; + } + *pchDst++ = ','; + *pchDst = 0; + for (int i = 0; i < COUNTOF(g_Encodings); i++) { + if (StrStrIA(g_Encodings[i].pszParseNames, chTest)) { + CPINFO cpi; + if ((g_Encodings[i].uFlags & NCP_INTERNAL) || + IsValidCodePage(g_Encodings[i].uCodePage) && + GetCPInfo(g_Encodings[i].uCodePage, &cpi)) + return(i); + else + return(-1); + } + } + return(-1); +} +// ============================================================================ + + +int Encoding_GetByCodePage(UINT cp) { + for (int i = 0; i < COUNTOF(g_Encodings); i++) { + if (cp == g_Encodings[i].uCodePage) { + return i; + } + } + return CPI_ANSI_DEFAULT; +} +// ============================================================================ + + +BOOL Encoding_IsValid(int iTestEncoding) { + CPINFO cpi; + if ((iTestEncoding >= 0) && (iTestEncoding < COUNTOF(g_Encodings))) { + if ((g_Encodings[iTestEncoding].uFlags & NCP_INTERNAL) || + IsValidCodePage(g_Encodings[iTestEncoding].uCodePage) && + 
GetCPInfo(g_Encodings[iTestEncoding].uCodePage, &cpi)) { + return(TRUE); + } + } + return(FALSE); +} +// ============================================================================ + + +typedef struct _ee { + int id; + WCHAR wch[256]; +} ENCODINGENTRY, *PENCODINGENTRY; + +int CmpEncoding(const void *s1, const void *s2) { + return StrCmp(((PENCODINGENTRY)s1)->wch, ((PENCODINGENTRY)s2)->wch); +} +// ============================================================================ + + +void Encoding_AddToListView(HWND hwnd, int idSel, BOOL bRecodeOnly) { + int i; + int iSelItem = -1; + LVITEM lvi; + WCHAR wchBuf[256] = { L'\0' }; + + PENCODINGENTRY pEE = LocalAlloc(LPTR, COUNTOF(g_Encodings) * sizeof(ENCODINGENTRY)); + for (i = 0; i < COUNTOF(g_Encodings); i++) { + pEE[i].id = i; + GetString(g_Encodings[i].idsName, pEE[i].wch, COUNTOF(pEE[i].wch)); + } + qsort(pEE, COUNTOF(g_Encodings), sizeof(ENCODINGENTRY), CmpEncoding); + + ZeroMemory(&lvi, sizeof(LVITEM)); + lvi.mask = LVIF_PARAM | LVIF_TEXT | LVIF_IMAGE; + lvi.pszText = wchBuf; + + for (i = 0; i < COUNTOF(g_Encodings); i++) { + + int id = pEE[i].id; + if (!bRecodeOnly || (g_Encodings[id].uFlags & NCP_RECODE)) { + + lvi.iItem = ListView_GetItemCount(hwnd); + + WCHAR *pwsz = StrChr(pEE[i].wch, L';'); + if (pwsz) { + StringCchCopyN(wchBuf, COUNTOF(wchBuf), CharNext(pwsz), COUNTOF(wchBuf)); + pwsz = StrChr(wchBuf, L';'); + if (pwsz) + *pwsz = 0; + } + else + StringCchCopyN(wchBuf, COUNTOF(wchBuf), pEE[i].wch, COUNTOF(wchBuf)); + + if (Encoding_IsANSI(id)) + StringCchCatN(wchBuf, COUNTOF(wchBuf), wchANSI, COUNTOF(wchANSI)); + else if (Encoding_IsOEM(id)) + StringCchCatN(wchBuf, COUNTOF(wchBuf), wchOEM, COUNTOF(wchOEM)); + + if (Encoding_IsValid(id)) + lvi.iImage = 0; + else + lvi.iImage = 1; + + lvi.lParam = (LPARAM)id; + ListView_InsertItem(hwnd, &lvi); + + if (idSel == id) + iSelItem = lvi.iItem; + } + } + + LocalFree(pEE); + + if (iSelItem != -1) { + ListView_SetItemState(hwnd, iSelItem, LVIS_SELECTED | LVIS_FOCUSED, 
LVIS_SELECTED | LVIS_FOCUSED); + ListView_EnsureVisible(hwnd, iSelItem, FALSE); + } + else { + ListView_SetItemState(hwnd, 0, LVIS_FOCUSED, LVIS_FOCUSED); + ListView_EnsureVisible(hwnd, 0, FALSE); + } +} +// ============================================================================ + + +BOOL Encoding_GetFromListView(HWND hwnd, int *pidEncoding) { + LVITEM lvi; + + lvi.iItem = ListView_GetNextItem(hwnd, -1, LVNI_ALL | LVNI_SELECTED); + lvi.iSubItem = 0; + lvi.mask = LVIF_PARAM; + + if (ListView_GetItem(hwnd, &lvi)) { + if (Encoding_IsValid((int)lvi.lParam)) + *pidEncoding = (int)lvi.lParam; + else + *pidEncoding = -1; + + return (TRUE); + } + return(FALSE); +} +// ============================================================================ + + +void Encoding_AddToComboboxEx(HWND hwnd, int idSel, BOOL bRecodeOnly) { + int i; + int iSelItem = -1; + COMBOBOXEXITEM cbei; + WCHAR wchBuf[256] = { L'\0' }; + + PENCODINGENTRY pEE = LocalAlloc(LPTR, COUNTOF(g_Encodings) * sizeof(ENCODINGENTRY)); + for (i = 0; i < COUNTOF(g_Encodings); i++) { + pEE[i].id = i; + GetString(g_Encodings[i].idsName, pEE[i].wch, COUNTOF(pEE[i].wch)); + } + qsort(pEE, COUNTOF(g_Encodings), sizeof(ENCODINGENTRY), CmpEncoding); + + ZeroMemory(&cbei, sizeof(COMBOBOXEXITEM)); + cbei.mask = CBEIF_TEXT | CBEIF_IMAGE | CBEIF_SELECTEDIMAGE | CBEIF_LPARAM; + cbei.pszText = wchBuf; + cbei.cchTextMax = COUNTOF(wchBuf); + cbei.iImage = 0; + cbei.iSelectedImage = 0; + + for (i = 0; i < COUNTOF(g_Encodings); i++) { + + int id = pEE[i].id; + if (!bRecodeOnly || (g_Encodings[id].uFlags & NCP_RECODE)) { + + cbei.iItem = SendMessage(hwnd, CB_GETCOUNT, 0, 0); + + WCHAR *pwsz = StrChr(pEE[i].wch, L';'); + if (pwsz) { + StringCchCopyN(wchBuf, COUNTOF(wchBuf), CharNext(pwsz), COUNTOF(wchBuf)); + pwsz = StrChr(wchBuf, L';'); + if (pwsz) + *pwsz = 0; + } + else + StringCchCopyN(wchBuf, COUNTOF(wchBuf), pEE[i].wch, COUNTOF(wchBuf)); + + if (Encoding_IsANSI(id)) + StringCchCatN(wchBuf, COUNTOF(wchBuf), wchANSI, 
COUNTOF(wchANSI)); + else if (id == CPI_OEM) + StringCchCatN(wchBuf, COUNTOF(wchBuf), wchOEM, COUNTOF(wchOEM)); + + cbei.iImage = (Encoding_IsValid(id) ? 0 : 1); + + cbei.lParam = (LPARAM)id; + SendMessage(hwnd, CBEM_INSERTITEM, 0, (LPARAM)&cbei); + + if (idSel == id) + iSelItem = (int)cbei.iItem; + } + } + + LocalFree(pEE); + + if (iSelItem != -1) + SendMessage(hwnd, CB_SETCURSEL, (WPARAM)iSelItem, 0); +} +// ============================================================================ + + +BOOL Encoding_GetFromComboboxEx(HWND hwnd, int *pidEncoding) { + COMBOBOXEXITEM cbei; + + cbei.iItem = SendMessage(hwnd, CB_GETCURSEL, 0, 0); + cbei.mask = CBEIF_LPARAM; + + if (SendMessage(hwnd, CBEM_GETITEM, 0, (LPARAM)&cbei)) { + if (Encoding_IsValid((int)cbei.lParam)) + *pidEncoding = (int)cbei.lParam; + else + *pidEncoding = -1; + + return (TRUE); + } + return(FALSE); +} +// ============================================================================ + + +UINT Encoding_GetCodePage(int iEncoding) { + return g_Encodings[iEncoding].uCodePage; +} +// ============================================================================ + +BOOL Encoding_IsDefault(int iEncoding) { + return (g_Encodings[iEncoding].uFlags & NCP_DEFAULT); +} +// ============================================================================ + +BOOL Encoding_IsANSI(int iEncoding) { + return (g_Encodings[iEncoding].uFlags & NCP_ANSI); +} +// ============================================================================ + +BOOL Encoding_IsOEM(int iEncoding) { + return (g_Encodings[iEncoding].uFlags & NCP_OEM); +} +// ============================================================================ + +BOOL Encoding_IsUTF8(int iEncoding) { + return (g_Encodings[iEncoding].uFlags & NCP_UTF8); +} +// ============================================================================ + +BOOL Encoding_IsUTF8_SIGN(int iEncoding) { + return (g_Encodings[iEncoding].uFlags & NCP_UTF8_SIGN); +} +// 
============================================================================ + +BOOL Encoding_IsMBCS(int iEncoding) { + return (g_Encodings[iEncoding].uFlags & NCP_MBCS); +} +// ============================================================================ + +BOOL Encoding_IsUNICODE(int iEncoding) { + return (g_Encodings[iEncoding].uFlags & NCP_UNICODE); +} +// ============================================================================ + +BOOL Encoding_IsUNICODE_BOM(int iEncoding) { + return (g_Encodings[iEncoding].uFlags & NCP_UNICODE_BOM); +} +// ============================================================================ + +BOOL Encoding_IsUNICODE_REVERSE(int iEncoding) { + return (g_Encodings[iEncoding].uFlags & NCP_UNICODE_REVERSE); +} +// ============================================================================ + + +BOOL Encoding_IsINTERNAL(int iEncoding) { + return (g_Encodings[iEncoding].uFlags & NCP_INTERNAL); +} +// ============================================================================ + +BOOL Encoding_IsEXTERNAL_8BIT(int iEncoding) { + return (g_Encodings[iEncoding].uFlags & NCP_EXTERNAL_8BIT); +} +// ============================================================================ + +BOOL Encoding_IsRECODE(int iEncoding) { + return (g_Encodings[iEncoding].uFlags & NCP_RECODE); +} +// ============================================================================ + + +void Encoding_SetDefaultFlag(int iEncoding) { + g_Encodings[iEncoding].uFlags |= NCP_DEFAULT; +} +// ============================================================================ + + +const WCHAR* Encoding_GetLabel(int iEncoding) { + return g_Encodings[iEncoding].wchLabel; +} +// ============================================================================ + +const char* Encoding_GetParseNames(int iEncoding) { + return g_Encodings[iEncoding].pszParseNames; +} +// ============================================================================ + + + + +UINT Encoding_SciGetCodePage(HWND hwnd) { + 
UNUSED(hwnd); + return CP_UTF8; + // remove internal support for Chinese, Japan, Korean DBCS use UTF-8 instead + /* + int cp = (UINT)SendMessage(hwnd,SCI_GETCODEPAGE,0,0); + if (cp == 932 || cp == 936 || cp == 949 || cp == 950) { + return cp; + } + return (cp == 0) ? CP_ACP : CP_UTF8; + */ +} +// ============================================================================ + + +int Encoding_SciMappedCodePage(int iEncoding) { + UNUSED(iEncoding); + return SC_CP_UTF8; + // remove internal support for Chinese, Japan, Korean DBCS use UTF-8 instead + /* + if (Encoding_IsValid(iEncoding)) { + // check for Chinese, Japan, Korean DBCS code pages and switch accordingly + int cp = (int)g_Encodings[iEncoding].uCodePage; + if (cp == 932 || cp == 936 || cp == 949 || cp == 950) { + return cp; + } + } + */ +} +// ============================================================================ + + +void Encoding_SciSetCodePage(HWND hwnd, int iEncoding) { + int cp = Encoding_SciMappedCodePage(iEncoding); + SendMessage(hwnd, SCI_SETCODEPAGE, (WPARAM)cp, 0); + // charsets can be changed via styles schema + /* + int charset = SC_CHARSET_ANSI; + switch (cp) { + case 932: + charset = SC_CHARSET_SHIFTJIS; + break; + case 936: + charset = SC_CHARSET_GB2312; + break; + case 949: + charset = SC_CHARSET_HANGUL; + break; + case 950: + charset = SC_CHARSET_CHINESEBIG5; + break; + default: + charset = g_iDefaultCharSet; + break; + } + SendMessage(hwnd,SCI_STYLESETCHARACTERSET,(WPARAM)STYLE_DEFAULT,(LPARAM)charset); + */ +} +// ============================================================================ + + +extern BOOL bSkipUnicodeDetection; + +BOOL IsUnicode(const char* pBuffer, int cb, LPBOOL lpbBOM, LPBOOL lpbReverse) { + int i = 0xFFFF; + + BOOL bIsTextUnicode; + + BOOL bHasBOM; + BOOL bHasRBOM; + + if (!pBuffer || cb < 2) + return FALSE; + + if (!bSkipUnicodeDetection) + bIsTextUnicode = IsTextUnicode(pBuffer, cb, &i); + else + bIsTextUnicode = FALSE; + + bHasBOM = (*((UNALIGNED PWCHAR)pBuffer) 
== 0xFEFF); + bHasRBOM = (*((UNALIGNED PWCHAR)pBuffer) == 0xFFFE); + + if (i == 0xFFFF) // i doesn't seem to have been modified ... + i = 0; + + if (bIsTextUnicode || bHasBOM || bHasRBOM || + ((i & (IS_TEXT_UNICODE_UNICODE_MASK | IS_TEXT_UNICODE_REVERSE_MASK)) && + !((i & IS_TEXT_UNICODE_UNICODE_MASK) && (i & IS_TEXT_UNICODE_REVERSE_MASK)) && + !(i & IS_TEXT_UNICODE_ODD_LENGTH) && + !(i & IS_TEXT_UNICODE_ILLEGAL_CHARS && !(i & IS_TEXT_UNICODE_REVERSE_SIGNATURE)) && + !((i & IS_TEXT_UNICODE_REVERSE_MASK) == IS_TEXT_UNICODE_REVERSE_STATISTICS))) { + + if (lpbBOM) + *lpbBOM = (bHasBOM || bHasRBOM || + (i & (IS_TEXT_UNICODE_SIGNATURE | IS_TEXT_UNICODE_REVERSE_SIGNATURE))) + ? TRUE : FALSE; + + if (lpbReverse) + *lpbReverse = (bHasRBOM || (i & IS_TEXT_UNICODE_REVERSE_MASK)) ? TRUE : FALSE; + + return TRUE; + } + + else + + return FALSE; +} +// ============================================================================ + + +BOOL IsUTF8(const char* pTest, int nLength) +{ + static int byte_class_table[256] = { + /* 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F */ + /* 00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* 90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + /* A0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + /* B0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + /* C0 */ 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + /* D0 */ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + /* E0 */ 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 7, 7, + /* F0 */ 9,10,10,10,11, 4, 4, 4, 4, 
4, 4, 4, 4, 4, 4, 4 + /* 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F */ }; + + /* state table */ + typedef enum { + kSTART = 0, kA, kB, kC, kD, kE, kF, kG, kERROR, kNumOfStates + } utf8_state; + + static utf8_state state_table[] = { + /* kSTART, kA, kB, kC, kD, kE, kF, kG, kERROR */ + /* 0x00-0x7F: 0 */ kSTART, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, + /* 0x80-0x8F: 1 */ kERROR, kSTART, kA, kERROR, kA, kB, kERROR, kB, kERROR, + /* 0x90-0x9f: 2 */ kERROR, kSTART, kA, kERROR, kA, kB, kB, kERROR, kERROR, + /* 0xa0-0xbf: 3 */ kERROR, kSTART, kA, kA, kERROR, kB, kB, kERROR, kERROR, + /* 0xc0-0xc1, 0xf5-0xff: 4 */ kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, + /* 0xc2-0xdf: 5 */ kA, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, + /* 0xe0: 6 */ kC, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, + /* 0xe1-0xec, 0xee-0xef: 7 */ kB, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, + /* 0xed: 8 */ kD, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, + /* 0xf0: 9 */ kF, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, + /* 0xf1-0xf3: 10 */ kE, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, + /* 0xf4: 11 */ kG, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR }; + +#define BYTE_CLASS(b) (byte_class_table[(unsigned char)b]) +#define NEXT_STATE(b,cur) (state_table[(BYTE_CLASS(b) * kNumOfStates) + (cur)]) + + utf8_state current = kSTART; + int i; + + const char* pt = pTest; + int len = nLength; + + for (i = 0; i < len; i++, pt++) { + + current = NEXT_STATE(*pt, current); + if (kERROR == current) + break; + } + + return (current == kSTART) ? 
TRUE : FALSE; +} +// ============================================================================ + + +BOOL IsUTF7(const char* pTest, int nLength) { + int i; + const char *pt = pTest; + + for (i = 0; i < nLength; i++) { + if (*pt & 0x80 || !*pt) + return FALSE; + pt++; + } + + return TRUE; +} +// ============================================================================ + + +/* byte length of UTF-8 sequence based on value of first byte. +for UTF-16 (21-bit space), max. code length is 4, so we only need to look +at 4 upper bits. +*/ +static const INT utf8_lengths[16] = +{ + 1,1,1,1,1,1,1,1, /* 0000 to 0111 : 1 byte (plain ASCII) */ + 0,0,0,0, /* 1000 to 1011 : not valid */ + 2,2, /* 1100, 1101 : 2 bytes */ + 3, /* 1110 : 3 bytes */ + 4 /* 1111 :4 bytes */ +}; +// ============================================================================ + + +/*++ +Function : +UTF8_mbslen_bytes [INTERNAL] + +Calculates the byte size of a NULL-terminated UTF-8 string. + +Parameters : +char *utf8_string : string to examine + +Return value : +size (in bytes) of a NULL-terminated UTF-8 string. +-1 if invalid NULL-terminated UTF-8 string +--*/ +INT UTF8_mbslen_bytes(LPCSTR utf8_string) +{ + INT length = 0; + INT code_size; + BYTE byte; + + while (*utf8_string) { + byte = (BYTE)*utf8_string; + + if ((byte <= 0xF7) && (0 != (code_size = utf8_lengths[byte >> 4]))) { + length += code_size; + utf8_string += code_size; + } + else { + /* we got an invalid byte value but need to count it, + it will be later ignored during the string conversion */ + //WARN("invalid first byte value 0x%02X in UTF-8 sequence!\n",byte); + length++; + utf8_string++; + } + } + length++; /* include NULL terminator */ + return length; +} +// ============================================================================ + + +/*++ +Function : +UTF8_mbslen [INTERNAL] + +Calculates the character size of a NULL-terminated UTF-8 string. 
+ +Parameters : +char *utf8_string : string to examine +int byte_length : byte size of string + +Return value : +size (in characters) of a UTF-8 string. +-1 if invalid UTF-8 string +--*/ +INT UTF8_mbslen(LPCSTR source, INT byte_length) +{ + INT wchar_length = 0; + INT code_size; + BYTE byte; + + while (byte_length > 0) { + byte = (BYTE)*source; + + /* UTF-16 can't encode 5-byte and 6-byte sequences, so maximum value + for first byte is 11110111. Use lookup table to determine sequence + length based on upper 4 bits of first byte */ + if ((byte <= 0xF7) && (0 != (code_size = utf8_lengths[byte >> 4]))) { + /* 1 sequence == 1 character */ + wchar_length++; + + if (code_size == 4) + wchar_length++; + + source += code_size; /* increment pointer */ + byte_length -= code_size; /* decrement counter*/ + } + else { + /* + unlike UTF8_mbslen_bytes, we ignore the invalid characters. + we only report the number of valid characters we have encountered + to match the Windows behavior. + */ + //WARN("invalid byte 0x%02X in UTF-8 sequence, skipping it!\n", + // byte); + source++; + byte_length--; + } + } + return wchar_length; +} +// ============================================================================ + + + + +/* +* Copyright (C) 2006-2016 Wu Yongwei +* +* This software is provided 'as-is', without any express or implied +* warranty. In no event will the authors be held liable for any +* damages arising from the use of this software. +* +* Permission is granted to anyone to use this software for any purpose, +* including commercial applications, and to alter it and redistribute +* it freely, subject to the following restrictions: +* +* 1. The origin of this software must not be misrepresented; you must +* not claim that you wrote the original software. If you use this +* software in a product, an acknowledgement in the product +* documentation would be appreciated but is not required. +* 2. 
Altered source versions must be plainly marked as such, and must +* not be misrepresented as being the original software. +* 3. This notice may not be removed or altered from any source +* distribution. +* +* +* The latest version of this software should be available at: +* +* +*/ + +/** +* @file TellEnc.c +* +* Program to detect the encoding of text. It currently supports ASCII, +* UTF-8, UTF-16/32 (little-endian or big-endian), Latin1, Windows-1252, +* CP437, GB2312, GBK, Big5, and SJIS, among others. +* +* @version 1.22, 2016/07/26 +* @author Wu Yongwei +*/ + + +typedef unsigned short uint16_t; +typedef unsigned int uint32_t; + +typedef struct _char_count_t { + uint16_t first; + uint32_t second; +} char_count_t; + +//typedef pair char_count_t; +//typedef map char_count_map_t; +//typedef vector char_count_vec_t; + + +typedef struct _freq_analysis_data_t { + uint16_t dbyte; + const char* enc; +} freq_analysis_data_t; + + + +typedef enum { + UTF8_INVALID, + UTF8_1, + UTF8_2, + UTF8_3, + UTF8_4, + UTF8_TAIL +} UTF8_State; + + +#define MAX_CHAR 256 + +static const unsigned char NON_TEXT_CHARS[] = { 0, 26, 127, 255 }; +static const char NUL = '\0'; +static const char DOS_EOF = '\x1A'; +static const int EVEN = 0; +static const int ODD = 1; + +static UTF8_State utf8_char_table[MAX_CHAR]; + +// ============================================================================ + +static freq_analysis_data_t freq_analysis_data[] = { + { 0x9a74, "windows-1250" }, // "št" (Czech) + { 0xe865, "windows-1250" }, // "če" (Czech) + { 0xf865, "windows-1250" }, // "ře" (Czech) + { 0xe167, "windows-1250" }, // "ág" (Hungarian) + { 0xe96c, "windows-1250" }, // "él" (Hungarian) + { 0xb36f, "windows-1250" }, // "ło" (Polish) + { 0xea7a, "windows-1250" }, // "ęz" (Polish) + { 0xf377, "windows-1250" }, // "ów" (Polish) + { 0x9d20, "windows-1250" }, // "ť " (Slovak) + { 0xfa9d, "windows-1250" }, // "úť" (Slovak) + { 0x9e69, "windows-1250" }, // "ži" (Slovenian) + { 0xe869, "windows-1250" }, // 
"či" (Slovenian) + { 0xe020, "windows-1252" }, // "à " (French) + { 0xe920, "windows-1252" }, // "é " (French) + { 0xe963, "windows-1252" }, // "éc" (French) + { 0xe965, "windows-1252" }, // "ée" (French) + { 0xe972, "windows-1252" }, // "ér" (French) + { 0xe4e4, "windows-1252" }, // "ää" (Finnish) + { 0xe474, "windows-1252" }, // "ät" (German) + { 0xfc72, "windows-1252" }, // "ür" (German) + { 0xed6e, "windows-1252" }, // "ín" (Spanish) + { 0xf36e, "windows-1252" }, // "ón" (Spanish) + { 0x8220, "cp437" }, // "é " (French) + { 0x8263, "cp437" }, // "éc" (French) + { 0x8265, "cp437" }, // "ée" (French) + { 0x8272, "cp437" }, // "ér" (French) + { 0x8520, "cp437" }, // "à " (French) + { 0x8172, "cp437" }, // "ür" (German) + { 0x8474, "cp437" }, // "ät" (German) + { 0xc4c4, "cp437" }, // "──" + { 0xcdcd, "cp437" }, // "══" + { 0xdbdb, "cp437" }, // "██" + { 0xa1a1, "gbk" }, // " " + { 0xa1a2, "gbk" }, // "、" + { 0xa1a3, "gbk" }, // "。" + { 0xa1a4, "gbk" }, // "·" + { 0xa1b6, "gbk" }, // "《" + { 0xa1b7, "gbk" }, // "》" + { 0xa3ac, "gbk" }, // "," + { 0xa3ba, "gbk" }, // ":" + { 0xb5c4, "gbk" }, // "的" + { 0xc1cb, "gbk" }, // "了" + { 0xd2bb, "gbk" }, // "一" + { 0xcac7, "gbk" }, // "是" + { 0xb2bb, "gbk" }, // "不" + { 0xb8f6, "gbk" }, // "个" + { 0xc8cb, "gbk" }, // "人" + { 0xd5e2, "gbk" }, // "这" + { 0xd3d0, "gbk" }, // "有" + { 0xced2, "gbk" }, // "我" + { 0xc4e3, "gbk" }, // "你" + { 0xcbfb, "gbk" }, // "他" + { 0xcbfd, "gbk" }, // "她" + { 0xc9cf, "gbk" }, // "上" + { 0xbfb4, "gbk" }, // "看" + { 0xd6ae, "gbk" }, // "之" + { 0xbbb9, "gbk" }, // "还" + { 0xbfc9, "gbk" }, // "可" + { 0xbaf3, "gbk" }, // "后" + { 0xd6d0, "gbk" }, // "中" + { 0xd0d0, "gbk" }, // "行" + { 0xb1d2, "gbk" }, // "币" + { 0xb3f6, "gbk" }, // "出" + { 0xb7d1, "gbk" }, // "费" + { 0xb8d0, "gbk" }, // "感" + { 0xbef5, "gbk" }, // "觉" + { 0xc4ea, "gbk" }, // "年" + { 0xd4c2, "gbk" }, // "月" + { 0xc8d5, "gbk" }, // "日" + { 0xa140, "big5" }, // " " + { 0xa141, "big5" }, // "," + { 0xa143, "big5" }, // "。" + { 0xa147, 
"big5" }, // ":" + { 0xaaba, "big5" }, // "的" + { 0xa446, "big5" }, // "了" + { 0xa440, "big5" }, // "一" + { 0xac4f, "big5" }, // "是" + { 0xa4a3, "big5" }, // "不" + { 0xa448, "big5" }, // "人" + { 0xa7da, "big5" }, // "我" + { 0xa741, "big5" }, // "你" + { 0xa54c, "big5" }, // "他" + { 0xa66f, "big5" }, // "她" + { 0xadd3, "big5" }, // "個" + { 0xa457, "big5" }, // "上" + { 0xa662, "big5" }, // "在" + { 0xbba1, "big5" }, // "說" + { 0xa65e, "big5" }, // "回" + { 0x8140, "sjis" }, // " " + { 0x8141, "sjis" }, // "、" + { 0x8142, "sjis" }, // "。" + { 0x8145, "sjis" }, // "・" + { 0x8146, "sjis" }, // ":" + { 0x815b, "sjis" }, // "ー" + { 0x82b5, "sjis" }, // "し" + { 0x82bd, "sjis" }, // "た" + { 0x82c8, "sjis" }, // "な" + { 0x82c9, "sjis" }, // "に" + { 0x82cc, "sjis" }, // "の" + { 0x82dc, "sjis" }, // "ま" + { 0x82f0, "sjis" }, // "を" + { 0x8367, "sjis" }, // "ト" + { 0x8393, "sjis" }, // "ン" + { 0x89ef, "sjis" }, // "会" + { 0x906c, "sjis" }, // "人" + { 0x9094, "sjis" }, // "数" + { 0x93fa, "sjis" }, // "日" + { 0x95f1, "sjis" }, // "報" + { 0xa1bc, "euc-jp" }, // "ー" + { 0xa4bf, "euc-jp" }, // "た" + { 0xa4ca, "euc-jp" }, // "な" + { 0xa4cb, "euc-jp" }, // "に" + { 0xa4ce, "euc-jp" }, // "の" + { 0xa4de, "euc-jp" }, // "ま" + { 0xa4f2, "euc-jp" }, // "を" + { 0xa5c8, "euc-jp" }, // "ト" + { 0xa5f3, "euc-jp" }, // "ン" + { 0xb2f1, "euc-jp" }, // "会" + { 0xbfcd, "euc-jp" }, // "人" + { 0xbff4, "euc-jp" }, // "数" + { 0xc6fc, "euc-jp" }, // "日" + { 0xcaf3, "euc-jp" }, // "報" + { 0xc0cc, "euc-kr" }, // "이" + { 0xb0fa, "euc-kr" }, // "과" + { 0xb1e2, "euc-kr" }, // "기" + { 0xb4c2, "euc-kr" }, // "는" + { 0xb7ce, "euc-kr" }, // "로" + { 0xb1db, "euc-kr" }, // "글" + { 0xc5e4, "euc-kr" }, // "토" + { 0xc1a4, "euc-kr" }, // "정" + { 0xc920, "koi8-r" }, // "и " + { 0xc7cf, "koi8-r" }, // "го" + { 0xcbcf, "koi8-r" }, // "ко" + { 0xd3cb, "koi8-r" }, // "ск" + { 0xd3d4, "koi8-r" }, // "ст" + { 0xa6a7, "koi8-u" }, // "ії" + { 0xa6ce, "koi8-u" }, // "ін" + { 0xa6d7, "koi8-u" }, // "ів" + { 0xa7ce, "koi8-u" }, // 
"їн" + { 0xd0cf, "koi8-u" }, // "по" + { 0xd4c9, "koi8-u" }, // "ти" +}; +// ============================================================================ + + +static size_t nul_count_byte[2]; +static size_t nul_count_word[2]; + +static bool is_binary = false; +static bool is_valid_utf8 = true; +static bool is_valid_latin1 = true; +static uint32_t dbyte_cnt = 0; +static uint32_t dbyte_hihi_cnt = 0; + + +// ============================================================================ +// ============================================================================ + + +static inline bool is_non_text(char ch) +{ + for (size_t i = 0; i < sizeof(NON_TEXT_CHARS); ++i) { + if (ch == NON_TEXT_CHARS[i]) { + return true; + } + } + return false; +} +// ============================================================================ + + + +void init_utf8_char_table() +{ + int ch = 0; + utf8_char_table[ch] = UTF8_INVALID; + ++ch; + for (; ch <= 0x7f; ++ch) { + utf8_char_table[ch] = UTF8_1; + } + for (; ch <= 0xbf; ++ch) { + utf8_char_table[ch] = UTF8_TAIL; + } + for (; ch <= 0xc1; ++ch) { + utf8_char_table[ch] = UTF8_INVALID; + } + for (; ch <= 0xdf; ++ch) { + utf8_char_table[ch] = UTF8_2; + } + for (; ch <= 0xef; ++ch) { + utf8_char_table[ch] = UTF8_3; + } + for (; ch <= 0xf4; ++ch) { + utf8_char_table[ch] = UTF8_4; + } + for (; ch <= 0xff; ++ch) { + utf8_char_table[ch] = UTF8_INVALID; + } +} +// ============================================================================ + + + +static void init_sbyte_char_count(char_count_t sbyte_char_cnt[]) +{ + for (size_t i = 0; i < MAX_CHAR; ++i) { + sbyte_char_cnt[i].first = (uint16_t)i; + sbyte_char_cnt[i].second = 0; + } +} +// ============================================================================ + + + + + + +#if FALSE + + +typedef struct _pattern_t { + const char* name; + const char* pattern; + size_t pattern_len; +} pattern_t; + +static const char* check_ucs_bom(const unsigned char* const buffer, const size_t len) +{ + const 
pattern_t patterns[] = { + { "ucs-4", "\x00\x00\xFE\xFF", 4 }, + { "ucs-4le", "\xFF\xFE\x00\x00", 4 }, + { "utf-8", "\xEF\xBB\xBF", 3 }, + { "utf-16", "\xFE\xFF", 2 }, + { "utf-16le", "\xFF\xFE", 2 }, + { NULL, NULL, 0 } + }; + for (size_t i = 0; patterns[i].name; ++i) { + const pattern_t* item = &(patterns[i]); + if (len >= item->pattern_len && memcmp(buffer, item->pattern, item->pattern_len) == 0) { + return item->name; + } + } + return NULL; +} +// ============================================================================ + + + +static const char* check_freq_dbyte(uint16_t dbyte) +{ + for (size_t i = 0; + i < sizeof freq_analysis_data / sizeof(freq_analysis_data_t); + ++i) { + if (dbyte == freq_analysis_data[i].dbyte) { + return freq_analysis_data[i].enc; + } + } + return NULL; +} +// ============================================================================ + + + +static const char* search_freq_dbytes(const char_count_vec_t* dbyte_char_cnt) +{ + size_t max_comp_idx = 10; + if (max_comp_idx > dbyte_char_cnt->size()) { + max_comp_idx = dbyte_char_cnt->size(); + } + for (size_t i = 0; i < max_comp_idx; ++i) { + const char* enc = check_freq_dbyte(dbyte_char_cnt[i].first); + if (enc) { + return enc; + } + } + return NULL; +} +// ============================================================================ + + + +const char* tellenc(const unsigned char* const buffer, const size_t len) +{ + if (len == 0) { + return "unknown"; + } + + const char* result = check_ucs_bom(buffer, len); + if (result) { + return result; + } + + char_count_t sbyte_char_cnt[MAX_CHAR]; + char_count_map_t dbyte_char_cnt_map; + init_sbyte_char_count(sbyte_char_cnt); + + unsigned char ch; + int last_ch = EOF; + int utf8_state = UTF8_1; + for (size_t i = 0; i < len; ++i) { + ch = buffer[i]; + sbyte_char_cnt[ch].second++; + + // Check for binary data (including UTF-16/32) + if (is_non_text(ch)) { + if (!is_binary && !(ch == DOS_EOF && i == len - 1)) { + is_binary = true; + } + if (ch == NUL) { + 
// Count for NULs in even- and odd-number bytes + nul_count_byte[i & 1]++; + if (i & 1) { + if (buffer[i - 1] == NUL) { + // Count for NULs in even- and odd-number words + nul_count_word[(i / 2) & 1]++; + } + } + } + } + + // Check for UTF-8 validity + if (is_valid_utf8) { + switch (utf8_char_table[ch]) { + case UTF8_INVALID: + is_valid_utf8 = false; + break; + case UTF8_1: + if (utf8_state != UTF8_1) { + is_valid_utf8 = false; + } + break; + case UTF8_2: + if (utf8_state != UTF8_1) { + is_valid_utf8 = false; + } else { + utf8_state = UTF8_2; + } + break; + case UTF8_3: + if (utf8_state != UTF8_1) { + is_valid_utf8 = false; + } else { + utf8_state = UTF8_3; + } + break; + case UTF8_4: + if (utf8_state != UTF8_1) { + is_valid_utf8 = false; + } else { + utf8_state = UTF8_4; + } + break; + case UTF8_TAIL: + if (utf8_state > UTF8_1) { + utf8_state--; + } else { + is_valid_utf8 = false; + } + break; + } + } + + // Check whether non-Latin1 characters appear + if (is_valid_latin1) { + if (ch >= 0x80 && ch < 0xa0) { + is_valid_latin1 = false; + } + } + + // Construct double-bytes and count + if (last_ch != EOF) { + uint16_t dbyte_char = (last_ch << 8) + ch; + dbyte_char_cnt_map[dbyte_char]++; + dbyte_cnt++; + if (last_ch > 0xa0 && ch > 0xa0) { + dbyte_hihi_cnt++; + } + last_ch = EOF; + } else if (ch >= 0x80) { + last_ch = ch; + } + } + + // Get the character counts in descending order + sort(sbyte_char_cnt, sbyte_char_cnt + MAX_CHAR, greater_char_count()); + + // Get the double-byte counts in descending order + char_count_vec_t dbyte_char_cnt; + for (char_count_map_t::iterator it = dbyte_char_cnt_map.begin(); + it != dbyte_char_cnt_map.end(); ++it) { + dbyte_char_cnt.push_back(*it); + } + sort(dbyte_char_cnt.begin(), + dbyte_char_cnt.end(), + greater_char_count()); + + if (!is_valid_utf8 && is_binary) { + // Heuristics for UTF-16/32 + if (nul_count_byte[EVEN] > 4 && + (nul_count_byte[ODD] == 0 || + nul_count_byte[EVEN] / nul_count_byte[ODD] > 20)) { + return "utf-16"; + } 
else if (nul_count_byte[ODD] > 4 && + (nul_count_byte[EVEN] == 0 || + nul_count_byte[ODD] / nul_count_byte[EVEN] > 20)) { + return "utf-16le"; + } else if (nul_count_word[EVEN] > 4 && + (nul_count_word[ODD] == 0 || + nul_count_word[EVEN] / nul_count_word[ODD] > 20)) { + return "ucs-4"; // utf-32 is not a built-in encoding for Vim + } else if (nul_count_word[ODD] > 4 && + (nul_count_word[EVEN] == 0 || + nul_count_word[ODD] / nul_count_word[EVEN] > 20)) { + return "ucs-4le"; // utf-32le is not a built-in encoding for Vim + } else { + return "binary"; + } + } else if (dbyte_cnt == 0) { + // No characters outside the scope of ASCII + return "ascii"; + } else if (is_valid_utf8) { + // Only valid UTF-8 sequences + return "utf-8"; + } else if (const char* enc = search_freq_dbytes(dbyte_char_cnt)) { + return enc; + } else if (dbyte_hihi_cnt * 100 / dbyte_cnt < 5) { + // Mostly a low-byte follows a high-byte + return "windows-1252"; + } + return NULL; +} +// ============================================================================ + + +#endif +const char* tellenc(const unsigned char* const buffer, const size_t len) { UNUSED(buffer); UNUSED(len); return NULL; } + + +const char* tellenc_simplify(const char* const buffer, const size_t len) +{ + const char* enc = tellenc((const unsigned char*)buffer, len); + if (enc) { + if (strcmp(enc, "windows-1252") == 0 && is_valid_latin1) { + // Latin1 is subset of Windows-1252 + return "latin1"; + } else if (strcmp(enc, "gbk") == 0 && dbyte_hihi_cnt == dbyte_cnt) { + // Special case for GB2312: no high-byte followed by a low-byte + return "gb2312"; + } + } + return enc; +} +// ============================================================================ + + + +static bool bInitDone = false; + +int GetBufferEncoding(const char* const buffer, const size_t len) +{ + if (!bInitDone) { + init_utf8_char_table(); + bInitDone = true; + } + + const char* enc = tellenc_simplify(buffer, len); + + if (enc) + return 1; + + return 0; // unknown +} 
+// ============================================================================ diff --git a/src/Encoding.h b/src/Encoding.h new file mode 100644 index 000000000..cd92f7be0 --- /dev/null +++ b/src/Encoding.h @@ -0,0 +1,139 @@ +/****************************************************************************** +* * +* * +* Notepad3 * +* * +* Encoding.h * +* * +* Copyright (C) 2006-2016 Wu Yongwei * +* * +* This software is provided 'as-is', without any express or implied * +* warranty. In no event will the authors be held liable for any * +* damages arising from the use of this software. * +* * +* Permission is granted to anyone to use this software for any purpose, * +* including commercial applications, and to alter it and redistribute * +* it freely, subject to the following restrictions: * +* * +* 1. The origin of this software must not be misrepresented; you must * +* not claim that you wrote the original software. If you use this * +* software in a product, an acknowledgement in the product * +* documentation would be appreciated but is not required. * +* 2. Altered source versions must be plainly marked as such, and must * +* not be misrepresented as being the original software. * +* 3. This notice may not be removed or altered from any source * +* distribution. 
* +* * +* * +* The latest version of this software should be available at: * +* * +* * +* * +*******************************************************************************/ +#pragma once +#ifndef _NP3_ENCODING_H_ +#define _NP3_ENCODING_H_ + + +//__forceinline void swapi(int* a, int* b) { int t = *a; *a = *b; *b = t; } + + +extern int g_DOSEncoding; + +#define NCP_DEFAULT 1 +#define NCP_UTF8 2 +#define NCP_UTF8_SIGN 4 +#define NCP_UNICODE 8 +#define NCP_UNICODE_REVERSE 16 +#define NCP_UNICODE_BOM 32 +#define NCP_ANSI 64 +#define NCP_OEM 128 +#define NCP_MBCS 256 +#define NCP_INTERNAL (NCP_DEFAULT|NCP_UTF8|NCP_UTF8_SIGN|NCP_UNICODE|NCP_UNICODE_REVERSE|NCP_UNICODE_BOM|NCP_ANSI|NCP_OEM|NCP_MBCS) +#define NCP_EXTERNAL_8BIT 512 +#define NCP_RECODE 1024 + +#define CPI_GET -2 +#define CPI_NONE -1 +#define CPI_ANSI_DEFAULT 0 +#define CPI_OEM 1 +#define CPI_UNICODEBOM 2 +#define CPI_UNICODEBEBOM 3 +#define CPI_UNICODE 4 +#define CPI_UNICODEBE 5 +#define CPI_UTF8 6 +#define CPI_UTF8SIGN 7 +#define CPI_UTF7 8 + +#define IDS_ENCODINGNAME0 61000 +#define IDS_EOLMODENAME0 62000 + +typedef struct _np2encoding { + UINT uFlags; + UINT uCodePage; + char* pszParseNames; + int idsName; + WCHAR wchLabel[64]; +} NP2ENCODING; + +int Encoding_CountOf(); +int Encoding_Current(int); // getter/setter +int Encoding_SrcCmdLn(int); // getter/setter +int Encoding_SrcWeak(int); // getter/setter +BOOL Encoding_HasChanged(int); // query/setter + +void Encoding_InitDefaults(); +int Encoding_MapIniSetting(BOOL, int); +int Encoding_MapUnicode(int); +void Encoding_SetLabel(int); +int Encoding_MatchW(LPCWSTR); +int Encoding_MatchA(char*); +BOOL Encoding_IsValid(int); +int Encoding_GetByCodePage(UINT); +void Encoding_AddToListView(HWND, int, BOOL); +BOOL Encoding_GetFromListView(HWND, int *); +void Encoding_AddToComboboxEx(HWND, int, BOOL); +BOOL Encoding_GetFromComboboxEx(HWND, int *); + +UINT Encoding_GetCodePage(int); + +BOOL Encoding_IsDefault(int); +BOOL Encoding_IsANSI(int); +BOOL 
Encoding_IsOEM(int); +BOOL Encoding_IsUTF8(int); +BOOL Encoding_IsUTF8_SIGN(int); +BOOL Encoding_IsMBCS(int); +BOOL Encoding_IsUNICODE(int); +BOOL Encoding_IsUNICODE_BOM(int); +BOOL Encoding_IsUNICODE_REVERSE(int); +BOOL Encoding_IsINTERNAL(int); +BOOL Encoding_IsEXTERNAL_8BIT(int); +BOOL Encoding_IsRECODE(int); + +void Encoding_SetDefaultFlag(int); +const WCHAR* Encoding_GetLabel(int); +const char* Encoding_GetParseNames(int); + +// Scintilla related +UINT Encoding_SciGetCodePage(HWND); +int Encoding_SciMappedCodePage(int); +void Encoding_SciSetCodePage(HWND, int); + + +BOOL IsUnicode(const char*, int, LPBOOL, LPBOOL); +BOOL IsUTF8(const char*, int); +BOOL IsUTF7(const char*, int); + +#define IsUTF8Signature(p) ((*(p+0) == '\xEF' && *(p+1) == '\xBB' && *(p+2) == '\xBF')) +#define UTF8StringStart(p) (IsUTF8Signature(p)) ? (p+3) : (p) + +INT UTF8_mbslen_bytes(LPCSTR utf8_string); +INT UTF8_mbslen(LPCSTR source, INT byte_length); + + +int Encoding_GetEncoding(const char* const, const size_t); + + +// -------------------------------------------------------------------------------------------------------------------------------- + +#endif //_NP3_ENCODING_H_ diff --git a/src/Helpers.c b/src/Helpers.c index a78275a89..8bd7796cc 100644 --- a/src/Helpers.c +++ b/src/Helpers.c @@ -29,21 +29,21 @@ #include #include #include -#include -#include #include #include //#include #include "scintilla.h" #include "resource.h" #include "edit.h" +#include "encoding.h" #include "notepad3.h" + #include "helpers.h" +//============================================================================= extern HINSTANCE g_hInstance; - //============================================================================= // // Cut of substrings defined by pattern @@ -2756,881 +2756,6 @@ VOID RestoreWndFromTray(HWND hWnd) } -//============================================================================= -// -// Encoding Helper Functions -// - -int g_DOSEncoding; - -// Supported Encodings -WCHAR wchANSI[16] 
= { L'\0' }; -WCHAR wchOEM[16] = { L'\0' }; - -NP2ENCODING g_Encodings[] = { - { NCP_ANSI | NCP_RECODE, CP_ACP, "ansi,system,ascii,", 61000, L"" }, - { NCP_OEM | NCP_RECODE, CP_OEMCP, "oem,oem,", 61001, L"" }, - { NCP_UNICODE | NCP_UNICODE_BOM, CP_UTF8, "", 61002, L"" }, - { NCP_UNICODE | NCP_UNICODE_REVERSE | NCP_UNICODE_BOM, CP_UTF8, "", 61003, L"" }, - { NCP_UNICODE | NCP_RECODE, CP_UTF8, "utf-16,utf16,unicode,", 61004, L"" }, - { NCP_UNICODE | NCP_UNICODE_REVERSE | NCP_RECODE, CP_UTF8, "utf-16be,utf16be,unicodebe,", 61005, L"" }, - { NCP_UTF8 | NCP_RECODE, CP_UTF8, "utf-8,utf8,", 61006, L"" }, - { NCP_UTF8 | NCP_UTF8_SIGN, CP_UTF8, "utf-8,utf8,", 61007, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, CP_UTF7, "utf-7,utf7,", 61008, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 720, "DOS-720,dos720,", 61009, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 28596, "iso-8859-6,iso88596,arabic,csisolatinarabic,ecma114,isoir127,", 61010, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 10004, "x-mac-arabic,xmacarabic,", 61011, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 1256, "windows-1256,windows1256,cp1256", 61012, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 775, "ibm775,ibm775,cp500,", 61013, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 28594, "iso-8859-4,iso88594,csisolatin4,isoir110,l4,latin4,", 61014, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 1257, "windows-1257,windows1257,", 61015, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 852, "ibm852,ibm852,cp852,", 61016, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 28592, "iso-8859-2,iso88592,csisolatin2,isoir101,latin2,l2,", 61017, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 10029, "x-mac-ce,xmacce,", 61018, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 1250, "windows-1250,windows1250,xcp1250,", 61019, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 936, "gb2312,gb2312,chinese,cngb,csgb2312,csgb231280,gb231280,gbk,", 61020, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 10008, "x-mac-chinesesimp,xmacchinesesimp,", 61021, L"" }, - { NCP_EXTERNAL_8BIT | 
NCP_RECODE, 950, "big5,big5,cnbig5,csbig5,xxbig5,", 61022, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 10002, "x-mac-chinesetrad,xmacchinesetrad,", 61023, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 10082, "x-mac-croatian,xmaccroatian,", 61024, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 866, "cp866,cp866,ibm866,", 61025, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 28595, "iso-8859-5,iso88595,csisolatin5,csisolatincyrillic,cyrillic,isoir144,", 61026, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 20866, "koi8-r,koi8r,cskoi8r,koi,koi8,", 61027, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 21866, "koi8-u,koi8u,koi8ru,", 61028, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 10007, "x-mac-cyrillic,xmaccyrillic,", 61029, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 1251, "windows-1251,windows1251,xcp1251,", 61030, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 28603, "iso-8859-13,iso885913,", 61031, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 863, "ibm863,ibm863,", 61032, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 737, "ibm737,ibm737,", 61033, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 28597, "iso-8859-7,iso88597,csisolatingreek,ecma118,elot928,greek,greek8,isoir126,", 61034, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 10006, "x-mac-greek,xmacgreek,", 61035, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 1253, "windows-1253,windows1253,", 61036, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 869, "ibm869,ibm869,", 61037, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 862, "DOS-862,dos862,", 61038, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 38598, "iso-8859-8-i,iso88598i,logical,", 61039, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 28598, "iso-8859-8,iso88598,csisolatinhebrew,hebrew,isoir138,visual,", 61040, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 10005, "x-mac-hebrew,xmachebrew,", 61041, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 1255, "windows-1255,windows1255,", 61042, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 861, "ibm861,ibm861,", 61043, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 10079, 
"x-mac-icelandic,xmacicelandic,", 61044, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 10001, "x-mac-japanese,xmacjapanese,", 61045, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 932, "shift_jis,shiftjis,shiftjs,csshiftjis,cswindows31j,mskanji,xmscp932,xsjis,", 61046, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 10003, "x-mac-korean,xmackorean,", 61047, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 949, "windows-949,windows949,ksc56011987,csksc5601,euckr,isoir149,korean,ksc56011989", 61048, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 28593, "iso-8859-3,iso88593,latin3,isoir109,l3,", 61049, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 28605, "iso-8859-15,iso885915,latin9,l9,", 61050, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 865, "ibm865,ibm865,", 61051, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 437, "ibm437,ibm437,437,cp437,cspc8,codepage437,", 61052, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 858, "ibm858,ibm858,ibm00858,", 61053, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 860, "ibm860,ibm860,", 61054, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 10010, "x-mac-romanian,xmacromanian,", 61055, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 10021, "x-mac-thai,xmacthai,", 61056, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 874, "windows-874,windows874,dos874,iso885911,tis620,", 61057, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 857, "ibm857,ibm857,", 61058, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 28599, "iso-8859-9,iso88599,latin5,isoir148,l5,", 61059, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 10081, "x-mac-turkish,xmacturkish,", 61060, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 1254, "windows-1254,windows1254,", 61061, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 10017, "x-mac-ukrainian,xmacukrainian,", 61062, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 1258, "windows-1258,windows-258,", 61063, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 850, "ibm850,ibm850,", 61064, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 28591, "iso-8859-1,iso88591,cp819,latin1,ibm819,isoir100,latin1,l1,", 61065, 
L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 10000, "macintosh,macintosh,", 61066, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 1252, "windows-1252,windows1252,cp367,cp819,ibm367,us,xansi,", 61067, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 37, "ebcdic-cp-us,ebcdiccpus,ebcdiccpca,ebcdiccpwt,ebcdiccpnl,ibm037,cp037,", 61068, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 500, "x-ebcdic-international,xebcdicinternational,", 61069, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 875, "x-EBCDIC-GreekModern,xebcdicgreekmodern,", 61070, L"" }, - { NCP_EXTERNAL_8BIT | NCP_RECODE, 1026, "CP1026,cp1026,csibm1026,ibm1026,", 61071, L"" }, - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 870, "CP870,cp870,ebcdiccproece,ebcdiccpyu,csibm870,ibm870,", 00000, L"" }, // IBM EBCDIC (Multilingual Latin-2) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1047, "IBM01047,ibm01047,", 00000, L"" }, // IBM EBCDIC (Open System Latin-1) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1140, "x-ebcdic-cp-us-euro,xebcdiccpuseuro,", 00000, L"" }, // IBM EBCDIC (US-Canada-Euro) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1141, "x-ebcdic-germany-euro,xebcdicgermanyeuro,", 00000, L"" }, // IBM EBCDIC (Germany-Euro) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1142, "x-ebcdic-denmarknorway-euro,xebcdicdenmarknorwayeuro,", 00000, L"" }, // IBM EBCDIC (Denmark-Norway-Euro) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1143, "x-ebcdic-finlandsweden-euro,xebcdicfinlandswedeneuro,", 00000, L"" }, // IBM EBCDIC (Finland-Sweden-Euro) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1144, "x-ebcdic-italy-euro,xebcdicitalyeuro,", 00000, L"" }, // IBM EBCDIC (Italy-Euro) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1145, "x-ebcdic-spain-euro,xebcdicspaineuro,", 00000, L"" }, // IBM EBCDIC (Spain-Latin America-Euro) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1146, "x-ebcdic-uk-euro,xebcdicukeuro,", 00000, L"" }, // IBM EBCDIC (UK-Euro) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1147, "x-ebcdic-france-euro,xebcdicfranceeuro,", 00000, L"" }, // IBM EBCDIC (France-Euro) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1148, 
"x-ebcdic-international-euro,xebcdicinternationaleuro,", 00000, L"" }, // IBM EBCDIC (International-Euro) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1149, "x-ebcdic-icelandic-euro,xebcdicicelandiceuro,", 00000, L"" }, // IBM EBCDIC (Icelandic-Euro) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1361, "johab,johab,", 00000, L"" }, // Korean (Johab) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20273, "x-EBCDIC-Germany,xebcdicgermany,", 00000, L"" }, // IBM EBCDIC (Germany) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20277, "x-EBCDIC-DenmarkNorway,xebcdicdenmarknorway,ebcdiccpdk,ebcdiccpno,", 00000, L"" }, // IBM EBCDIC (Denmark-Norway) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20278, "x-EBCDIC-FinlandSweden,xebcdicfinlandsweden,ebcdicpfi,ebcdiccpse,", 00000, L"" }, // IBM EBCDIC (Finland-Sweden) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20280, "x-EBCDIC-Italy,xebcdicitaly,", 00000, L"" }, // IBM EBCDIC (Italy) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20284, "x-EBCDIC-Spain,xebcdicspain,ebcdiccpes,", 00000, L"" }, // IBM EBCDIC (Spain-Latin America) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20285, "x-EBCDIC-UK,xebcdicuk,ebcdiccpgb,", 00000, L"" }, // IBM EBCDIC (UK) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20290, "x-EBCDIC-JapaneseKatakana,xebcdicjapanesekatakana,", 00000, L"" }, // IBM EBCDIC (Japanese Katakana) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20297, "x-EBCDIC-France,xebcdicfrance,ebcdiccpfr,", 00000, L"" }, // IBM EBCDIC (France) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20420, "x-EBCDIC-Arabic,xebcdicarabic,ebcdiccpar1,", 00000, L"" }, // IBM EBCDIC (Arabic) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20423, "x-EBCDIC-Greek,xebcdicgreek,ebcdiccpgr,", 00000, L"" }, // IBM EBCDIC (Greek) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20424, "x-EBCDIC-Hebrew,xebcdichebrew,ebcdiccphe,", 00000, L"" }, // IBM EBCDIC (Hebrew) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20833, "x-EBCDIC-KoreanExtended,xebcdickoreanextended,", 00000, L"" }, // IBM EBCDIC (Korean Extended) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20838, "x-EBCDIC-Thai,xebcdicthai,ibmthai,csibmthai,", 
00000, L"" }, // IBM EBCDIC (Thai) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20871, "x-EBCDIC-Icelandic,xebcdicicelandic,ebcdiccpis,", 00000, L"" }, // IBM EBCDIC (Icelandic) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20880, "x-EBCDIC-CyrillicRussian,xebcdiccyrillicrussian,ebcdiccyrillic,", 00000, L"" }, // IBM EBCDIC (Cyrillic Russian) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20905, "x-EBCDIC-Turkish,xebcdicturkish,ebcdiccptr,", 00000, L"" }, // IBM EBCDIC (Turkish) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20924, "IBM00924,ibm00924,ebcdiclatin9euro,", 00000, L"" }, // IBM EBCDIC (Open System-Euro Latin-1) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 21025, "x-EBCDIC-CyrillicSerbianBulgarian,xebcdiccyrillicserbianbulgarian,", 00000, L"" }, // IBM EBCDIC (Cyrillic Serbian-Bulgarian) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50930, "x-EBCDIC-JapaneseAndKana,xebcdicjapaneseandkana,", 00000, L"" }, // IBM EBCDIC (Japanese and Japanese Katakana) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50931, "x-EBCDIC-JapaneseAndUSCanada,xebcdicjapaneseanduscanada,", 00000, L"" }, // IBM EBCDIC (Japanese and US-Canada) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50933, "x-EBCDIC-KoreanAndKoreanExtended,xebcdickoreanandkoreanextended,", 00000, L"" }, // IBM EBCDIC (Korean and Korean Extended) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50935, "x-EBCDIC-SimplifiedChinese,xebcdicsimplifiedchinese,", 00000, L"" }, // IBM EBCDIC (Chinese Simplified) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50937, "x-EBCDIC-TraditionalChinese,xebcdictraditionalchinese,", 00000, L"" }, // IBM EBCDIC (Chinese Traditional) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50939, "x-EBCDIC-JapaneseAndJapaneseLatin,xebcdicjapaneseandjapaneselatin,", 00000, L"" }, // IBM EBCDIC (Japanese and Japanese-Latin) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20105, "x-IA5,xia5,", 00000, L"" }, // Western European (IA5) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20106, "x-IA5-German,xia5german,", 00000, L"" }, // German (IA5) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20107, "x-IA5-Swedish,xia5swedish,", 00000, L"" }, // 
Swedish (IA5) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20108, "x-IA5-Norwegian,xia5norwegian,", 00000, L"" }, // Norwegian (IA5) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20936, "x-cp20936,xcp20936,", 00000, L"" }, // Chinese Simplified (GB2312) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20932, "euc-jp,,", 00000, L"" }, // Japanese (JIS X 0208-1990 & 0212-1990) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50220, "iso-2022-jp,iso2022jp,", 00000, L"" }, // Japanese (JIS) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50221, "csISO2022JP,csiso2022jp,", 00000, L"" }, // Japanese (JIS-Allow 1 byte Kana) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50222, "_iso-2022-jp$SIO,iso2022jpSIO,", 00000, L"" }, // Japanese (JIS-Allow 1 byte Kana - SO/SI) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50225, "iso-2022-kr,iso2022kr,csiso2022kr,", 00000, L"" }, // Korean (ISO-2022-KR) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50227, "x-cp50227,xcp50227,", 00000, L"" }, // Chinese Simplified (ISO-2022) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50229, "iso-2022-cn,iso2022cn,", 00000, L"" }, // Chinese Traditional (ISO-2022) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20000, "x-Chinese-CNS,xchinesecns,", 00000, L"" }, // Chinese Traditional (CNS) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20002, "x-Chinese-Eten,xchineseeten,", 00000, L"" }, // Chinese Traditional (Eten) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 51932, "euc-jp,eucjp,xeuc,xeucjp,", 00000, L"" }, // Japanese (EUC) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 51936, "euc-cn,euccn,xeuccn,", 00000, L"" }, // Chinese Simplified (EUC) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 51949, "euc-kr,euckr,cseuckr,", 00000, L"" }, // Korean (EUC) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 52936, "hz-gb-2312,hzgb2312,hz,", 00000, L"" }, // Chinese Simplified (HZ-GB2312) - { NCP_EXTERNAL_8BIT | NCP_RECODE, 54936, "gb18030,gb18030,", 61072, L"" } // Chinese Simplified (GB18030) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57002, "x-iscii-de,xisciide,", 00000, L"" }, // ISCII Devanagari - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57003, "x-iscii-be,xisciibe,", 00000, 
L"" }, // ISCII Bengali - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57004, "x-iscii-ta,xisciita,", 00000, L"" }, // ISCII Tamil - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57005, "x-iscii-te,xisciite,", 00000, L"" }, // ISCII Telugu - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57006, "x-iscii-as,xisciias,", 00000, L"" }, // ISCII Assamese - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57007, "x-iscii-or,xisciior,", 00000, L"" }, // ISCII Oriya - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57008, "x-iscii-ka,xisciika,", 00000, L"" }, // ISCII Kannada - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57009, "x-iscii-ma,xisciima,", 00000, L"" }, // ISCII Malayalam - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57010, "x-iscii-gu,xisciigu,", 00000, L"" }, // ISCII Gujarathi - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57011, "x-iscii-pa,xisciipa,", 00000, L"" }, // ISCII Panjabi -}; - -int Encoding_CountOf() { - return COUNTOF(g_Encodings); -} - -int Encoding_Current(int iEncoding) { - static int CurrentEncoding = CPI_NONE; - - if (iEncoding >= 0) { - if (Encoding_IsValid(iEncoding)) - CurrentEncoding = iEncoding; - else - CurrentEncoding = CPI_UTF8; - } - return CurrentEncoding; -} - - -int Encoding_SrcCmdLn(int iSrcEncoding) { - static int SourceEncoding = CPI_NONE; - - if (iSrcEncoding >= 0) { - if (Encoding_IsValid(iSrcEncoding)) - SourceEncoding = iSrcEncoding; - else - SourceEncoding = CPI_UTF8; - } - else if (iSrcEncoding == CPI_NONE) { - SourceEncoding = CPI_NONE; - } - return SourceEncoding; -} - - -int Encoding_SrcWeak(int iSrcWeakEnc) { - static int SourceWeakEncoding = CPI_NONE; - - if (iSrcWeakEnc >= 0) { - if (Encoding_IsValid(iSrcWeakEnc)) - SourceWeakEncoding = iSrcWeakEnc; - else - SourceWeakEncoding = CPI_ANSI_DEFAULT; - } - else if (iSrcWeakEnc == CPI_NONE) { - SourceWeakEncoding = CPI_NONE; - } - return SourceWeakEncoding; -} - - -BOOL Encoding_HasChanged(int iOriginalEncoding) { - static int OriginalEncoding = CPI_NONE; - - if (iOriginalEncoding >= CPI_NONE) { - OriginalEncoding = iOriginalEncoding; - } - return 
(BOOL)(OriginalEncoding != Encoding_Current(CPI_GET)); -} - - -void Encoding_InitDefaults() { - const UINT uCodePageMBCS[20] = { - 42, // (Symbol) - 50220,50221,50222,50225,50227,50229, // (Chinese, Japanese, Korean) - 54936, // (GB18030) - 57002,57003,57004,57005,57006,57007,57008,57009,57010,57011, // (ISCII) - 65000, // (UTF-7) - 65001 // (UTF-8) - }; - - g_Encodings[CPI_ANSI_DEFAULT].uCodePage = GetACP(); // set ANSI system CP - StringCchPrintf(wchANSI,COUNTOF(wchANSI),L" (CP-%u)",g_Encodings[CPI_ANSI_DEFAULT].uCodePage); - - for (int i = CPI_UTF7 + 1; i < COUNTOF(g_Encodings); ++i) { - if (Encoding_IsValid(i) && (g_Encodings[i].uCodePage == g_Encodings[CPI_ANSI_DEFAULT].uCodePage)) { - g_Encodings[i].uFlags |= NCP_ANSI; - if (g_Encodings[i].uFlags & NCP_EXTERNAL_8BIT) - g_Encodings[CPI_ANSI_DEFAULT].uFlags |= NCP_EXTERNAL_8BIT; - break; - } - } - - g_Encodings[CPI_OEM].uCodePage = GetOEMCP(); - StringCchPrintf(wchOEM,COUNTOF(wchOEM),L" (CP-%u)",g_Encodings[CPI_OEM].uCodePage); - - for (int i = CPI_UTF7 + 1; i < COUNTOF(g_Encodings); ++i) { - if (Encoding_IsValid(i) && (g_Encodings[i].uCodePage == g_Encodings[CPI_OEM].uCodePage)) { - g_Encodings[i].uFlags |= NCP_OEM; - if (g_Encodings[i].uFlags & NCP_EXTERNAL_8BIT) - g_Encodings[CPI_OEM].uFlags |= NCP_EXTERNAL_8BIT; - break; - } - } - - // multi byte character sets - for (int i = 0; i < COUNTOF(g_Encodings); ++i) { - for (int k = 0; k < COUNTOF(uCodePageMBCS); k++) { - if (g_Encodings[i].uCodePage == uCodePageMBCS[k]) { - g_Encodings[i].uFlags |= NCP_MBCS; - } - } - } - - g_DOSEncoding = CPI_OEM; - // Try to set the DOS encoding to DOS-437 if the default OEMCP is not DOS-437 - if (g_Encodings[g_DOSEncoding].uCodePage != 437) { - for (int i = CPI_UTF7 + 1; i < COUNTOF(g_Encodings); ++i) { - if (Encoding_IsValid(i) && (g_Encodings[i].uCodePage == 437)) { - g_DOSEncoding = i; - break; - } - } - } -} - - -int Encoding_MapIniSetting(BOOL bLoad,int iSetting) { - if (bLoad) { - switch (iSetting) { - case -1: return 
CPI_NONE; - case 0: return CPI_ANSI_DEFAULT; - case 1: return CPI_UNICODEBOM; - case 2: return CPI_UNICODEBEBOM; - case 3: return CPI_UTF8; - case 4: return CPI_UTF8SIGN; - case 5: return CPI_OEM; - case 6: return CPI_UNICODE; - case 7: return CPI_UNICODEBE; - case 8: return CPI_UTF7; - default: { - for (int i = CPI_UTF7 + 1; i < COUNTOF(g_Encodings); i++) { - if ((g_Encodings[i].uCodePage == (UINT)iSetting) && Encoding_IsValid(i)) - return(i); - } - return CPI_ANSI_DEFAULT; - } - } - } - else { - switch (iSetting) { - case CPI_NONE: return -1; - case CPI_ANSI_DEFAULT: return 0; - case CPI_UNICODEBOM: return 1; - case CPI_UNICODEBEBOM: return 2; - case CPI_UTF8: return 3; - case CPI_UTF8SIGN: return 4; - case CPI_OEM: return 5; - case CPI_UNICODE: return 6; - case CPI_UNICODEBE: return 7; - case CPI_UTF7: return 8; - default: { - if (Encoding_IsValid(iSetting)) - return(g_Encodings[iSetting].uCodePage); - else - return CPI_ANSI_DEFAULT; - } - } - } -} - -int Encoding_MapUnicode(int iUni) { - - if (iUni == CPI_UNICODEBOM) - return CPI_UNICODE; - else if (iUni == CPI_UNICODEBEBOM) - return CPI_UNICODEBE; - else if (iUni == CPI_UTF8SIGN) - return CPI_UTF8; - else - return iUni; -} - -void Encoding_SetLabel(int iEncoding) { - if (g_Encodings[iEncoding].wchLabel[0] == L'\0') { - WCHAR wch1[128] = { L'\0' }; - WCHAR wch2[128] = { L'\0' }; - GetString(g_Encodings[iEncoding].idsName,wch1,COUNTOF(wch1)); - WCHAR *pwsz = StrChr(wch1,L';'); - if (pwsz) { - pwsz = StrChr(CharNext(pwsz),L';'); - if (pwsz) { - pwsz = CharNext(pwsz); - } - } - if (!pwsz) - pwsz = wch1; - - StringCchCopyN(wch2,COUNTOF(wch2),pwsz,COUNTOF(wch1)); - - if (Encoding_IsANSI(iEncoding)) - StringCchCatN(wch2,COUNTOF(wch2),wchANSI,COUNTOF(wchANSI)); - else if (Encoding_IsOEM(iEncoding)) - StringCchCatN(wch2,COUNTOF(wch2),wchOEM,COUNTOF(wchOEM)); - - StringCchCopyN(g_Encodings[iEncoding].wchLabel,COUNTOF(g_Encodings[iEncoding].wchLabel), - wch2,COUNTOF(g_Encodings[iEncoding].wchLabel)); - } -} - - -int 
Encoding_MatchW(LPCWSTR pwszTest) { - char tchTest[256] = { '\0' }; - WideCharToMultiByteStrg(CP_ACP,pwszTest,tchTest); - return(Encoding_MatchA(tchTest)); -} - - -int Encoding_MatchA(char *pchTest) { - char chTest[256] = { '\0' }; - char *pchSrc = pchTest; - char *pchDst = chTest; - *pchDst++ = ','; - while (*pchSrc) { - if (IsCharAlphaNumericA(*pchSrc)) - *pchDst++ = *CharLowerA(pchSrc); - pchSrc++; - } - *pchDst++ = ','; - *pchDst = 0; - for (int i = 0; i < COUNTOF(g_Encodings); i++) { - if (StrStrIA(g_Encodings[i].pszParseNames,chTest)) { - CPINFO cpi; - if ((g_Encodings[i].uFlags & NCP_INTERNAL) || - IsValidCodePage(g_Encodings[i].uCodePage) && - GetCPInfo(g_Encodings[i].uCodePage,&cpi)) - return(i); - else - return(-1); - } - } - return(-1); -} - - -int Encoding_GetByCodePage(UINT cp) { - for (int i = 0; i < COUNTOF(g_Encodings); i++) { - if (cp == g_Encodings[i].uCodePage) { - return i; - } - } - return CPI_ANSI_DEFAULT; -} - - -BOOL Encoding_IsValid(int iTestEncoding) { - CPINFO cpi; - if ((iTestEncoding >= 0) && (iTestEncoding < COUNTOF(g_Encodings))) { - if ((g_Encodings[iTestEncoding].uFlags & NCP_INTERNAL) || - IsValidCodePage(g_Encodings[iTestEncoding].uCodePage) && - GetCPInfo(g_Encodings[iTestEncoding].uCodePage,&cpi)) { - return(TRUE); - } - } - return(FALSE); -} - - -typedef struct _ee { - int id; - WCHAR wch[256]; -} ENCODINGENTRY,*PENCODINGENTRY; - -int CmpEncoding(const void *s1,const void *s2) { - return StrCmp(((PENCODINGENTRY)s1)->wch,((PENCODINGENTRY)s2)->wch); -} - -void Encoding_AddToListView(HWND hwnd,int idSel,BOOL bRecodeOnly) { - int i; - int iSelItem = -1; - LVITEM lvi; - WCHAR wchBuf[256] = { L'\0' }; - - PENCODINGENTRY pEE = LocalAlloc(LPTR,COUNTOF(g_Encodings) * sizeof(ENCODINGENTRY)); - for (i = 0; i < COUNTOF(g_Encodings); i++) { - pEE[i].id = i; - GetString(g_Encodings[i].idsName,pEE[i].wch,COUNTOF(pEE[i].wch)); - } - qsort(pEE,COUNTOF(g_Encodings),sizeof(ENCODINGENTRY),CmpEncoding); - - ZeroMemory(&lvi,sizeof(LVITEM)); - 
lvi.mask = LVIF_PARAM | LVIF_TEXT | LVIF_IMAGE; - lvi.pszText = wchBuf; - - for (i = 0; i < COUNTOF(g_Encodings); i++) { - - int id = pEE[i].id; - if (!bRecodeOnly || (g_Encodings[id].uFlags & NCP_RECODE)) { - - lvi.iItem = ListView_GetItemCount(hwnd); - - WCHAR *pwsz = StrChr(pEE[i].wch,L';'); - if (pwsz) { - StringCchCopyN(wchBuf,COUNTOF(wchBuf),CharNext(pwsz),COUNTOF(wchBuf)); - pwsz = StrChr(wchBuf,L';'); - if (pwsz) - *pwsz = 0; - } - else - StringCchCopyN(wchBuf,COUNTOF(wchBuf),pEE[i].wch,COUNTOF(wchBuf)); - - if (Encoding_IsANSI(id)) - StringCchCatN(wchBuf,COUNTOF(wchBuf),wchANSI,COUNTOF(wchANSI)); - else if (Encoding_IsOEM(id)) - StringCchCatN(wchBuf,COUNTOF(wchBuf),wchOEM,COUNTOF(wchOEM)); - - if (Encoding_IsValid(id)) - lvi.iImage = 0; - else - lvi.iImage = 1; - - lvi.lParam = (LPARAM)id; - ListView_InsertItem(hwnd,&lvi); - - if (idSel == id) - iSelItem = lvi.iItem; - } - } - - LocalFree(pEE); - - if (iSelItem != -1) { - ListView_SetItemState(hwnd,iSelItem,LVIS_SELECTED | LVIS_FOCUSED,LVIS_SELECTED | LVIS_FOCUSED); - ListView_EnsureVisible(hwnd,iSelItem,FALSE); - } - else { - ListView_SetItemState(hwnd,0,LVIS_FOCUSED,LVIS_FOCUSED); - ListView_EnsureVisible(hwnd,0,FALSE); - } -} - - -BOOL Encoding_GetFromListView(HWND hwnd,int *pidEncoding) { - LVITEM lvi; - - lvi.iItem = ListView_GetNextItem(hwnd,-1,LVNI_ALL | LVNI_SELECTED); - lvi.iSubItem = 0; - lvi.mask = LVIF_PARAM; - - if (ListView_GetItem(hwnd,&lvi)) { - if (Encoding_IsValid((int)lvi.lParam)) - *pidEncoding = (int)lvi.lParam; - else - *pidEncoding = -1; - - return (TRUE); - } - return(FALSE); -} - - -void Encoding_AddToComboboxEx(HWND hwnd,int idSel,BOOL bRecodeOnly) { - int i; - int iSelItem = -1; - COMBOBOXEXITEM cbei; - WCHAR wchBuf[256] = { L'\0' }; - - PENCODINGENTRY pEE = LocalAlloc(LPTR,COUNTOF(g_Encodings) * sizeof(ENCODINGENTRY)); - for (i = 0; i < COUNTOF(g_Encodings); i++) { - pEE[i].id = i; - GetString(g_Encodings[i].idsName,pEE[i].wch,COUNTOF(pEE[i].wch)); - } - 
qsort(pEE,COUNTOF(g_Encodings),sizeof(ENCODINGENTRY),CmpEncoding); - - ZeroMemory(&cbei,sizeof(COMBOBOXEXITEM)); - cbei.mask = CBEIF_TEXT | CBEIF_IMAGE | CBEIF_SELECTEDIMAGE | CBEIF_LPARAM; - cbei.pszText = wchBuf; - cbei.cchTextMax = COUNTOF(wchBuf); - cbei.iImage = 0; - cbei.iSelectedImage = 0; - - for (i = 0; i < COUNTOF(g_Encodings); i++) { - - int id = pEE[i].id; - if (!bRecodeOnly || (g_Encodings[id].uFlags & NCP_RECODE)) { - - cbei.iItem = SendMessage(hwnd,CB_GETCOUNT,0,0); - - WCHAR *pwsz = StrChr(pEE[i].wch,L';'); - if (pwsz) { - StringCchCopyN(wchBuf,COUNTOF(wchBuf),CharNext(pwsz),COUNTOF(wchBuf)); - pwsz = StrChr(wchBuf,L';'); - if (pwsz) - *pwsz = 0; - } - else - StringCchCopyN(wchBuf,COUNTOF(wchBuf),pEE[i].wch,COUNTOF(wchBuf)); - - if (Encoding_IsANSI(id)) - StringCchCatN(wchBuf,COUNTOF(wchBuf),wchANSI,COUNTOF(wchANSI)); - else if (id == CPI_OEM) - StringCchCatN(wchBuf,COUNTOF(wchBuf),wchOEM,COUNTOF(wchOEM)); - - cbei.iImage = (Encoding_IsValid(id) ? 0 : 1); - - cbei.lParam = (LPARAM)id; - SendMessage(hwnd,CBEM_INSERTITEM,0,(LPARAM)&cbei); - - if (idSel == id) - iSelItem = (int)cbei.iItem; - } - } - - LocalFree(pEE); - - if (iSelItem != -1) - SendMessage(hwnd,CB_SETCURSEL,(WPARAM)iSelItem,0); -} - - -BOOL Encoding_GetFromComboboxEx(HWND hwnd,int *pidEncoding) { - COMBOBOXEXITEM cbei; - - cbei.iItem = SendMessage(hwnd,CB_GETCURSEL,0,0); - cbei.mask = CBEIF_LPARAM; - - if (SendMessage(hwnd,CBEM_GETITEM,0,(LPARAM)&cbei)) { - if (Encoding_IsValid((int)cbei.lParam)) - *pidEncoding = (int)cbei.lParam; - else - *pidEncoding = -1; - - return (TRUE); - } - return(FALSE); -} - - - - -BOOL Encoding_IsDefault(int iEncoding) { - return (g_Encodings[iEncoding].uFlags & NCP_DEFAULT); -} - -BOOL Encoding_IsANSI(int iEncoding) { - return (g_Encodings[iEncoding].uFlags & NCP_ANSI); -} - -BOOL Encoding_IsOEM(int iEncoding) { - return (g_Encodings[iEncoding].uFlags & NCP_OEM); -} - -UINT Encoding_SciGetCodePage(HWND hwnd) { - UNUSED(hwnd); - return CP_UTF8; - // remove 
internal support for Chinese, Japan, Korean DBCS use UTF-8 instead - /* - int cp = (UINT)SendMessage(hwnd,SCI_GETCODEPAGE,0,0); - if (cp == 932 || cp == 936 || cp == 949 || cp == 950) { - return cp; - } - return (cp == 0) ? CP_ACP : CP_UTF8; - */ -} - - -int Encoding_SciMappedCodePage(int iEncoding) { - UNUSED(iEncoding); - return SC_CP_UTF8; - // remove internal support for Chinese, Japan, Korean DBCS use UTF-8 instead - /* - if (Encoding_IsValid(iEncoding)) { - // check for Chinese, Japan, Korean DBCS code pages and switch accordingly - int cp = (int)g_Encodings[iEncoding].uCodePage; - if (cp == 932 || cp == 936 || cp == 949 || cp == 950) { - return cp; - } - } - */ -} - - -void Encoding_SciSetCodePage(HWND hwnd,int iEncoding) { - int cp = Encoding_SciMappedCodePage(iEncoding); - SendMessage(hwnd,SCI_SETCODEPAGE,(WPARAM)cp,0); - // charsets can be changed via styles schema - /* - int charset = SC_CHARSET_ANSI; - switch (cp) { - case 932: - charset = SC_CHARSET_SHIFTJIS; - break; - case 936: - charset = SC_CHARSET_GB2312; - break; - case 949: - charset = SC_CHARSET_HANGUL; - break; - case 950: - charset = SC_CHARSET_CHINESEBIG5; - break; - default: - charset = g_iDefaultCharSet; - break; - } - SendMessage(hwnd,SCI_STYLESETCHARACTERSET,(WPARAM)STYLE_DEFAULT,(LPARAM)charset); - */ -} - -extern BOOL bSkipUnicodeDetection; - -BOOL IsUnicode(const char* pBuffer,int cb,LPBOOL lpbBOM,LPBOOL lpbReverse) { - int i = 0xFFFF; - - BOOL bIsTextUnicode; - - BOOL bHasBOM; - BOOL bHasRBOM; - - if (!pBuffer || cb < 2) - return FALSE; - - if (!bSkipUnicodeDetection) - bIsTextUnicode = IsTextUnicode(pBuffer,cb,&i); - else - bIsTextUnicode = FALSE; - - bHasBOM = (*((UNALIGNED PWCHAR)pBuffer) == 0xFEFF); - bHasRBOM = (*((UNALIGNED PWCHAR)pBuffer) == 0xFFFE); - - if (i == 0xFFFF) // i doesn't seem to have been modified ... 
- i = 0; - - if (bIsTextUnicode || bHasBOM || bHasRBOM || - ((i & (IS_TEXT_UNICODE_UNICODE_MASK | IS_TEXT_UNICODE_REVERSE_MASK)) && - !((i & IS_TEXT_UNICODE_UNICODE_MASK) && (i & IS_TEXT_UNICODE_REVERSE_MASK)) && - !(i & IS_TEXT_UNICODE_ODD_LENGTH) && - !(i & IS_TEXT_UNICODE_ILLEGAL_CHARS && !(i & IS_TEXT_UNICODE_REVERSE_SIGNATURE)) && - !((i & IS_TEXT_UNICODE_REVERSE_MASK) == IS_TEXT_UNICODE_REVERSE_STATISTICS))) { - - if (lpbBOM) - *lpbBOM = (bHasBOM || bHasRBOM || - (i & (IS_TEXT_UNICODE_SIGNATURE | IS_TEXT_UNICODE_REVERSE_SIGNATURE))) - ? TRUE : FALSE; - - if (lpbReverse) - *lpbReverse = (bHasRBOM || (i & IS_TEXT_UNICODE_REVERSE_MASK)) ? TRUE : FALSE; - - return TRUE; - } - - else - - return FALSE; -} - - -BOOL IsUTF8(const char* pTest,int nLength) { - static int byte_class_table[256] = { - /* 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F */ - /* 00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - /* 90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - /* A0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - /* B0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - /* C0 */ 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - /* D0 */ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - /* E0 */ 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 7, 7, - /* F0 */ 9,10,10,10,11, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 - /* 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F */ }; - - /* state table */ - typedef enum { - kSTART = 0,kA,kB,kC,kD,kE,kF,kG,kERROR,kNumOfStates - } utf8_state; - - static utf8_state 
state_table[] = { - /* kSTART, kA, kB, kC, kD, kE, kF, kG, kERROR */ - /* 0x00-0x7F: 0 */ kSTART, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, - /* 0x80-0x8F: 1 */ kERROR, kSTART, kA, kERROR, kA, kB, kERROR, kB, kERROR, - /* 0x90-0x9f: 2 */ kERROR, kSTART, kA, kERROR, kA, kB, kB, kERROR, kERROR, - /* 0xa0-0xbf: 3 */ kERROR, kSTART, kA, kA, kERROR, kB, kB, kERROR, kERROR, - /* 0xc0-0xc1, 0xf5-0xff: 4 */ kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, - /* 0xc2-0xdf: 5 */ kA, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, - /* 0xe0: 6 */ kC, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, - /* 0xe1-0xec, 0xee-0xef: 7 */ kB, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, - /* 0xed: 8 */ kD, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, - /* 0xf0: 9 */ kF, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, - /* 0xf1-0xf3: 10 */ kE, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, - /* 0xf4: 11 */ kG, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR, kERROR }; - -#define BYTE_CLASS(b) (byte_class_table[(unsigned char)b]) -#define NEXT_STATE(b,cur) (state_table[(BYTE_CLASS(b) * kNumOfStates) + (cur)]) - - utf8_state current = kSTART; - int i; - - const char* pt = pTest; - int len = nLength; - - for (i = 0; i < len; i++,pt++) { - - current = NEXT_STATE(*pt,current); - if (kERROR == current) - break; - } - - return (current == kSTART) ? TRUE : FALSE; -} - - -BOOL IsUTF7(const char* pTest,int nLength) { - int i; - const char *pt = pTest; - - for (i = 0; i < nLength; i++) { - if (*pt & 0x80 || !*pt) - return FALSE; - pt++; - } - - return TRUE; -} - - -/* byte length of UTF-8 sequence based on value of first byte. -for UTF-16 (21-bit space), max. code length is 4, so we only need to look -at 4 upper bits. 
-*/ -static const INT utf8_lengths[16] = -{ - 1,1,1,1,1,1,1,1, /* 0000 to 0111 : 1 byte (plain ASCII) */ - 0,0,0,0, /* 1000 to 1011 : not valid */ - 2,2, /* 1100, 1101 : 2 bytes */ - 3, /* 1110 : 3 bytes */ - 4 /* 1111 :4 bytes */ -}; - -/*++ -Function : -UTF8_mbslen_bytes [INTERNAL] - -Calculates the byte size of a NULL-terminated UTF-8 string. - -Parameters : -char *utf8_string : string to examine - -Return value : -size (in bytes) of a NULL-terminated UTF-8 string. --1 if invalid NULL-terminated UTF-8 string ---*/ -INT UTF8_mbslen_bytes(LPCSTR utf8_string) -{ - INT length = 0; - INT code_size; - BYTE byte; - - while (*utf8_string) { - byte = (BYTE)*utf8_string; - - if ((byte <= 0xF7) && (0 != (code_size = utf8_lengths[byte >> 4]))) { - length += code_size; - utf8_string += code_size; - } - else { - /* we got an invalid byte value but need to count it, - it will be later ignored during the string conversion */ - //WARN("invalid first byte value 0x%02X in UTF-8 sequence!\n",byte); - length++; - utf8_string++; - } - } - length++; /* include NULL terminator */ - return length; -} - -/*++ -Function : -UTF8_mbslen [INTERNAL] - -Calculates the character size of a NULL-terminated UTF-8 string. - -Parameters : -char *utf8_string : string to examine -int byte_length : byte size of string - -Return value : -size (in characters) of a UTF-8 string. --1 if invalid UTF-8 string ---*/ -INT UTF8_mbslen(LPCSTR source,INT byte_length) -{ - INT wchar_length = 0; - INT code_size; - BYTE byte; - - while (byte_length > 0) { - byte = (BYTE)*source; - - /* UTF-16 can't encode 5-byte and 6-byte sequences, so maximum value - for first byte is 11110111. 
Use lookup table to determine sequence - length based on upper 4 bits of first byte */ - if ((byte <= 0xF7) && (0 != (code_size = utf8_lengths[byte >> 4]))) { - /* 1 sequence == 1 character */ - wchar_length++; - - if (code_size == 4) - wchar_length++; - - source += code_size; /* increment pointer */ - byte_length -= code_size; /* decrement counter*/ - } - else { - /* - unlike UTF8_mbslen_bytes, we ignore the invalid characters. - we only report the number of valid characters we have encountered - to match the Windows behavior. - */ - //WARN("invalid byte 0x%02X in UTF-8 sequence, skipping it!\n", - // byte); - source++; - byte_length--; - } - } - return wchar_length; -} - - /** * Is the character an octal digit? @@ -3641,7 +2766,6 @@ static BOOL IsDigit(WCHAR wch) } - //============================================================================= // // UrlUnescapeEx() diff --git a/src/Helpers.h b/src/Helpers.h index 9ebcbcf0e..d991f2c60 100644 --- a/src/Helpers.h +++ b/src/Helpers.h @@ -22,7 +22,7 @@ #include #include -#include "TypeDefs.h" +#include "typedefs.h" // ============================================================================ @@ -164,6 +164,7 @@ BOOL IsCmdEnabled(HWND, UINT); #define DialogEnableWindow(hdlg, id, b) { HWND hctrl = GetDlgItem((hdlg),(id)); if (!(b)) { \ if (GetFocus() == hctrl) { SendMessage((hdlg), WM_NEXTDLGCTL, 0, FALSE); } }; EnableWindow(hctrl, (b)); } + #define GetString(id,pb,cb) LoadString(g_hInstance,id,pb,cb) #define StrEnd(pStart) (pStart + lstrlen(pStart)) @@ -347,84 +348,6 @@ inline int _StringCchCmpINW(PCNZWCH s1,int l1,PCNZWCH s2,int l2) { #define StringCchCompareIX(s1,s2) StringCchCompareIXA((s1),(s2)) #endif -// ===== File Encoding ===== - -extern int g_DOSEncoding; - -#define NCP_DEFAULT 1 -#define NCP_UTF8 2 -#define NCP_UTF8_SIGN 4 -#define NCP_UNICODE 8 -#define NCP_UNICODE_REVERSE 16 -#define NCP_UNICODE_BOM 32 -#define NCP_ANSI 64 -#define NCP_OEM 128 -#define NCP_MBCS 256 -#define NCP_INTERNAL 
(NCP_DEFAULT|NCP_UTF8|NCP_UTF8_SIGN|NCP_UNICODE|NCP_UNICODE_REVERSE|NCP_UNICODE_BOM|NCP_ANSI|NCP_OEM|NCP_MBCS) -#define NCP_EXTERNAL_8BIT 512 -#define NCP_RECODE 1024 - -#define CPI_GET -2 -#define CPI_NONE -1 -#define CPI_ANSI_DEFAULT 0 -#define CPI_OEM 1 -#define CPI_UNICODEBOM 2 -#define CPI_UNICODEBEBOM 3 -#define CPI_UNICODE 4 -#define CPI_UNICODEBE 5 -#define CPI_UTF8 6 -#define CPI_UTF8SIGN 7 -#define CPI_UTF7 8 - -#define IDS_ENCODINGNAME0 61000 -#define IDS_EOLMODENAME0 62000 - -typedef struct _np2encoding { - UINT uFlags; - UINT uCodePage; - char* pszParseNames; - int idsName; - WCHAR wchLabel[64]; -} NP2ENCODING; - -int Encoding_CountOf(); -int Encoding_Current(int); // getter/setter -int Encoding_SrcCmdLn(int); // getter/setter -int Encoding_SrcWeak(int); // getter/setter -BOOL Encoding_HasChanged(int); // query/setter - -void Encoding_InitDefaults(); -int Encoding_MapIniSetting(BOOL,int); -int Encoding_MapUnicode(int); -void Encoding_SetLabel(int); -int Encoding_MatchW(LPCWSTR); -int Encoding_MatchA(char*); -BOOL Encoding_IsValid(int); -int Encoding_GetByCodePage(UINT); -void Encoding_AddToListView(HWND,int,BOOL); -BOOL Encoding_GetFromListView(HWND,int *); -void Encoding_AddToComboboxEx(HWND,int,BOOL); -BOOL Encoding_GetFromComboboxEx(HWND,int *); -BOOL Encoding_IsDefault(int); -BOOL Encoding_IsANSI(int); -BOOL Encoding_IsOEM(int); - -UINT Encoding_SciGetCodePage(HWND); -int Encoding_SciMappedCodePage(int); -void Encoding_SciSetCodePage(HWND,int); - - -BOOL IsUnicode(const char*,int,LPBOOL,LPBOOL); -BOOL IsUTF8(const char*,int); -BOOL IsUTF7(const char*,int); - -#define IsUTF8Signature(p) ((*(p+0) == '\xEF' && *(p+1) == '\xBB' && *(p+2) == '\xBF')) -#define UTF8StringStart(p) (IsUTF8Signature(p)) ? 
(p+3) : (p) - -INT UTF8_mbslen_bytes(LPCSTR utf8_string); -INT UTF8_mbslen(LPCSTR source,INT byte_length); - - void UrlUnescapeEx(LPWSTR, LPWSTR, DWORD*); // -------------------------------------------------------------------------------------------------------------------------------- diff --git a/src/Notepad3.c b/src/Notepad3.c index b77badf03..30b3c8f8d 100644 --- a/src/Notepad3.c +++ b/src/Notepad3.c @@ -35,6 +35,7 @@ #include //#include #include + #include "scintilla.h" #include "scilexer.h" #include "edit.h" @@ -43,10 +44,13 @@ #include "resource.h" #include "../crypto/crypto.h" #include "../uthash/utarray.h" +#include "encoding.h" #include "helpers.h" -#include "notepad3.h" #include "SciCall.h" +#include "notepad3.h" + + /****************************************************************************** * @@ -288,8 +292,6 @@ EDITFINDREPLACE g_efrData = EFR_INIT_DATA; UINT cpLastFind = 0; BOOL bReplaceInitialized = FALSE; -extern NP2ENCODING g_Encodings[]; - int iLineEndings[3] = { SC_EOL_CRLF, SC_EOL_LF, @@ -2111,15 +2113,15 @@ void MsgInitMenu(HWND hwnd,WPARAM wParam,LPARAM lParam) EnableCmd(hmenu,IDM_ENCODING_RECODE,i); - if (g_Encodings[Encoding_Current(CPI_GET)].uFlags & NCP_UNICODE_REVERSE) + if (Encoding_IsUNICODE_REVERSE(Encoding_Current(CPI_GET))) i = IDM_ENCODING_UNICODEREV; - else if (g_Encodings[Encoding_Current(CPI_GET)].uFlags & NCP_UNICODE) + else if (Encoding_IsUNICODE(Encoding_Current(CPI_GET))) i = IDM_ENCODING_UNICODE; - else if (g_Encodings[Encoding_Current(CPI_GET)].uFlags & NCP_UTF8_SIGN) + else if (Encoding_IsUTF8_SIGN(Encoding_Current(CPI_GET))) i = IDM_ENCODING_UTF8SIGN; - else if (g_Encodings[Encoding_Current(CPI_GET)].uFlags & NCP_UTF8) + else if (Encoding_IsUTF8(Encoding_Current(CPI_GET))) i = IDM_ENCODING_UTF8; - else if (g_Encodings[Encoding_Current(CPI_GET)].uFlags & NCP_ANSI) + else if (Encoding_IsANSI(Encoding_Current(CPI_GET))) i = IDM_ENCODING_ANSI; else i = -1; @@ -2222,7 +2224,7 @@ void MsgInitMenu(HWND hwnd,WPARAM 
wParam,LPARAM lParam) i == SCLEX_AU3 || i == SCLEX_LATEX || i == SCLEX_AHK || i == SCLEX_RUBY || i == SCLEX_CMAKE || i == SCLEX_MARKDOWN || i == SCLEX_YAML || i == SCLEX_REGISTRY || i == SCLEX_NIMROD)); - EnableCmd(hmenu,IDM_EDIT_INSERT_ENCODING,*g_Encodings[Encoding_Current(CPI_GET)].pszParseNames); + EnableCmd(hmenu,IDM_EDIT_INSERT_ENCODING, *Encoding_GetParseNames(Encoding_Current(CPI_GET))); //EnableCmd(hmenu,IDM_EDIT_INSERT_SHORTDATE,!bReadOnly); //EnableCmd(hmenu,IDM_EDIT_INSERT_LONGDATE,!bReadOnly); @@ -3335,10 +3337,10 @@ LRESULT MsgCommand(HWND hwnd, WPARAM wParam, LPARAM lParam) case IDM_EDIT_INSERT_ENCODING: { - if (*g_Encodings[Encoding_Current(CPI_GET)].pszParseNames) { + if (*Encoding_GetParseNames(Encoding_Current(CPI_GET))) { char msz[32] = { '\0' }; //int iSelStart; - StringCchCopyNA(msz,COUNTOF(msz),g_Encodings[Encoding_Current(CPI_GET)].pszParseNames,COUNTOF(msz)); + StringCchCopyNA(msz,COUNTOF(msz), Encoding_GetParseNames(Encoding_Current(CPI_GET)),COUNTOF(msz)); char *p = StrChrA(msz, ','); if (p) *p = 0; @@ -6018,7 +6020,7 @@ void LoadSettings() */ // set flag for encoding default - g_Encodings[g_iDefaultNewFileEncoding].uFlags |= NCP_DEFAULT; + Encoding_SetDefaultFlag(g_iDefaultNewFileEncoding); // define default charset g_iDefaultCharSet = (int)CharSetFromCodePage((UINT)iSciDefaultCodePage); @@ -7057,7 +7059,7 @@ void UpdateStatusbar() FormatString(tchDocSize, COUNTOF(tchDocSize), IDS_DOCSIZE, tchBytes); Encoding_SetLabel(iEncoding); - StringCchPrintf(tchEncoding, COUNTOF(tchEncoding), L" %s ", g_Encodings[iEncoding].wchLabel); + StringCchPrintf(tchEncoding, COUNTOF(tchEncoding), L" %s ", Encoding_GetLabel(iEncoding)); if (g_iEOLMode == SC_EOL_CR) { diff --git a/src/Notepad3.vcxproj b/src/Notepad3.vcxproj index 72f78cdba..101a5e840 100644 --- a/src/Notepad3.vcxproj +++ b/src/Notepad3.vcxproj @@ -321,6 +321,7 @@ + @@ -334,6 +335,7 @@ + diff --git a/src/Notepad3.vcxproj.filters b/src/Notepad3.vcxproj.filters index a70098f45..ca9fc7d5a 100644 
--- a/src/Notepad3.vcxproj.filters +++ b/src/Notepad3.vcxproj.filters @@ -54,6 +54,9 @@ Crypto + + Source Files + @@ -101,6 +104,9 @@ Header Files + + Header Files + diff --git a/src/Print.cpp b/src/Print.cpp index 62cabf3c8..119c4606c 100644 --- a/src/Print.cpp +++ b/src/Print.cpp @@ -6,9 +6,10 @@ * Print.cpp * * Scintilla Printing Functionality * * Based on code from Notepad2, (c) Florian Balmer 1996-2011 * -* Mostly taken from SciTE, (c) Neil Hodgson * * * -* (c) Rizonesoft 2008-2016 * +* Mostly taken from SciTE, (c) Neil Hodgson * +* * +* (c) Rizonesoft 2015-2018 * * https://rizonesoft.com * * * * * @@ -34,6 +35,7 @@ #include "scintilla.h" #include "scilexer.h" #include "resource.h" + extern "C" { #include "dialogs.h" #include "helpers.h" diff --git a/src/Styles.c b/src/Styles.c index facf6f978..949046212 100644 --- a/src/Styles.c +++ b/src/Styles.c @@ -30,16 +30,18 @@ #include #include #include + #include "scintilla.h" #include "scilexer.h" #include "notepad3.h" #include "edit.h" #include "dialogs.h" #include "resource.h" +#include "encoding.h" #include "helpers.h" -#include "styles.h" #include "SciCall.h" +#include "styles.h" extern HINSTANCE g_hInstance; From 504c9d9404e181d95e1c6c9249f5313f8839f1f9 Mon Sep 17 00:00:00 2001 From: Rainer Kottenhoff Date: Fri, 2 Mar 2018 02:22:05 +0100 Subject: [PATCH 5/6] + enh: prepare enhanced ANSI codepage detection --- src/Encoding.c | 761 ++++++++++++++++++++++++------------------------ src/Notepad3.rc | 2 + 2 files changed, 381 insertions(+), 382 deletions(-) diff --git a/src/Encoding.c b/src/Encoding.c index 2c91e51e3..0d6bba481 100644 --- a/src/Encoding.c +++ b/src/Encoding.c @@ -30,7 +30,7 @@ #include #include -//#include "../uthash/utarray.h" +#include "../uthash/utarray.h" #include "scintilla.h" #include "helpers.h" @@ -52,149 +52,151 @@ WCHAR wchANSI[16] = { L'\0' }; WCHAR wchOEM[16] = { L'\0' }; static NP2ENCODING g_Encodings[] = { -{ NCP_ANSI | NCP_RECODE, CP_ACP, "ansi,system,ascii,", 61000, L"" }, -{ NCP_OEM | 
NCP_RECODE, CP_OEMCP, "oem,oem,", 61001, L"" }, -{ NCP_UNICODE | NCP_UNICODE_BOM, CP_UTF8, "", 61002, L"" }, -{ NCP_UNICODE | NCP_UNICODE_REVERSE | NCP_UNICODE_BOM, CP_UTF8, "", 61003, L"" }, -{ NCP_UNICODE | NCP_RECODE, CP_UTF8, "utf-16,utf16,unicode,", 61004, L"" }, -{ NCP_UNICODE | NCP_UNICODE_REVERSE | NCP_RECODE, CP_UTF8, "utf-16be,utf16be,unicodebe,", 61005, L"" }, -{ NCP_UTF8 | NCP_RECODE, CP_UTF8, "utf-8,utf8,", 61006, L"" }, -{ NCP_UTF8 | NCP_UTF8_SIGN, CP_UTF8, "utf-8,utf8,", 61007, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, CP_UTF7, "utf-7,utf7,", 61008, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 720, "DOS-720,dos720,", 61009, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28596, "iso-8859-6,iso88596,arabic,csisolatinarabic,ecma114,isoir127,", 61010, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10004, "x-mac-arabic,xmacarabic,", 61011, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1256, "windows-1256,windows1256,cp1256", 61012, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 775, "ibm775,ibm775,cp500,", 61013, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28594, "iso-8859-4,iso88594,csisolatin4,isoir110,l4,latin4,", 61014, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1257, "windows-1257,windows1257,", 61015, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 852, "ibm852,ibm852,cp852,", 61016, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28592, "iso-8859-2,iso88592,csisolatin2,isoir101,latin2,l2,", 61017, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10029, "x-mac-ce,xmacce,", 61018, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1250, "windows-1250,windows1250,xcp1250,", 61019, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 936, "gb2312,gb2312,chinese,cngb,csgb2312,csgb231280,gb231280,gbk,", 61020, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10008, "x-mac-chinesesimp,xmacchinesesimp,", 61021, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 950, "big5,big5,cnbig5,csbig5,xxbig5,", 61022, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10002, "x-mac-chinesetrad,xmacchinesetrad,", 61023, L"" }, -{ NCP_EXTERNAL_8BIT | 
NCP_RECODE, 10082, "x-mac-croatian,xmaccroatian,", 61024, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 866, "cp866,cp866,ibm866,", 61025, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28595, "iso-8859-5,iso88595,csisolatin5,csisolatincyrillic,cyrillic,isoir144,", 61026, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 20866, "koi8-r,koi8r,cskoi8r,koi,koi8,", 61027, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 21866, "koi8-u,koi8u,koi8ru,", 61028, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10007, "x-mac-cyrillic,xmaccyrillic,", 61029, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1251, "windows-1251,windows1251,xcp1251,", 61030, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28603, "iso-8859-13,iso885913,", 61031, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 863, "ibm863,ibm863,", 61032, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 737, "ibm737,ibm737,", 61033, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28597, "iso-8859-7,iso88597,csisolatingreek,ecma118,elot928,greek,greek8,isoir126,", 61034, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10006, "x-mac-greek,xmacgreek,", 61035, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1253, "windows-1253,windows1253,", 61036, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 869, "ibm869,ibm869,", 61037, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 862, "DOS-862,dos862,", 61038, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 38598, "iso-8859-8-i,iso88598i,logical,", 61039, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28598, "iso-8859-8,iso88598,csisolatinhebrew,hebrew,isoir138,visual,", 61040, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10005, "x-mac-hebrew,xmachebrew,", 61041, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1255, "windows-1255,windows1255,", 61042, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 861, "ibm861,ibm861,", 61043, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10079, "x-mac-icelandic,xmacicelandic,", 61044, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10001, "x-mac-japanese,xmacjapanese,", 61045, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 932, 
"shift_jis,shiftjis,shiftjs,csshiftjis,cswindows31j,mskanji,xmscp932,xsjis,", 61046, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10003, "x-mac-korean,xmackorean,", 61047, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 949, "windows-949,windows949,ksc56011987,csksc5601,euckr,isoir149,korean,ksc56011989", 61048, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28593, "iso-8859-3,iso88593,latin3,isoir109,l3,", 61049, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28605, "iso-8859-15,iso885915,latin9,l9,", 61050, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 865, "ibm865,ibm865,", 61051, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 437, "ibm437,ibm437,437,cp437,cspc8,codepage437,", 61052, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 858, "ibm858,ibm858,ibm00858,", 61053, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 860, "ibm860,ibm860,", 61054, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10010, "x-mac-romanian,xmacromanian,", 61055, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10021, "x-mac-thai,xmacthai,", 61056, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 874, "windows-874,windows874,dos874,iso885911,tis620,", 61057, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 857, "ibm857,ibm857,", 61058, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28599, "iso-8859-9,iso88599,latin5,isoir148,l5,", 61059, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10081, "x-mac-turkish,xmacturkish,", 61060, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1254, "windows-1254,windows1254,", 61061, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10017, "x-mac-ukrainian,xmacukrainian,", 61062, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1258, "windows-1258,windows-258,", 61063, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 850, "ibm850,ibm850,", 61064, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28591, "iso-8859-1,iso88591,cp819,latin1,ibm819,isoir100,latin1,l1,", 61065, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10000, "macintosh,macintosh,", 61066, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1252, "windows-1252,windows1252,cp367,cp819,ibm367,us,xansi,", 61067, L"" }, 
-{ NCP_EXTERNAL_8BIT | NCP_RECODE, 37, "ebcdic-cp-us,ebcdiccpus,ebcdiccpca,ebcdiccpwt,ebcdiccpnl,ibm037,cp037,", 61068, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 500, "x-ebcdic-international,xebcdicinternational,", 61069, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 875, "x-EBCDIC-GreekModern,xebcdicgreekmodern,", 61070, L"" }, -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1026, "CP1026,cp1026,csibm1026,ibm1026,", 61071, L"" }, -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 870, "CP870,cp870,ebcdiccproece,ebcdiccpyu,csibm870,ibm870,", 00000, L"" }, // IBM EBCDIC (Multilingual Latin-2) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1047, "IBM01047,ibm01047,", 00000, L"" }, // IBM EBCDIC (Open System Latin-1) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1140, "x-ebcdic-cp-us-euro,xebcdiccpuseuro,", 00000, L"" }, // IBM EBCDIC (US-Canada-Euro) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1141, "x-ebcdic-germany-euro,xebcdicgermanyeuro,", 00000, L"" }, // IBM EBCDIC (Germany-Euro) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1142, "x-ebcdic-denmarknorway-euro,xebcdicdenmarknorwayeuro,", 00000, L"" }, // IBM EBCDIC (Denmark-Norway-Euro) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1143, "x-ebcdic-finlandsweden-euro,xebcdicfinlandswedeneuro,", 00000, L"" }, // IBM EBCDIC (Finland-Sweden-Euro) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1144, "x-ebcdic-italy-euro,xebcdicitalyeuro,", 00000, L"" }, // IBM EBCDIC (Italy-Euro) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1145, "x-ebcdic-spain-euro,xebcdicspaineuro,", 00000, L"" }, // IBM EBCDIC (Spain-Latin America-Euro) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1146, "x-ebcdic-uk-euro,xebcdicukeuro,", 00000, L"" }, // IBM EBCDIC (UK-Euro) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1147, "x-ebcdic-france-euro,xebcdicfranceeuro,", 00000, L"" }, // IBM EBCDIC (France-Euro) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1148, "x-ebcdic-international-euro,xebcdicinternationaleuro,", 00000, L"" }, // IBM EBCDIC (International-Euro) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1149, "x-ebcdic-icelandic-euro,xebcdicicelandiceuro,", 00000, L"" }, // IBM EBCDIC 
(Icelandic-Euro) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1361, "johab,johab,", 00000, L"" }, // Korean (Johab) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20273, "x-EBCDIC-Germany,xebcdicgermany,", 00000, L"" }, // IBM EBCDIC (Germany) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20277, "x-EBCDIC-DenmarkNorway,xebcdicdenmarknorway,ebcdiccpdk,ebcdiccpno,", 00000, L"" }, // IBM EBCDIC (Denmark-Norway) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20278, "x-EBCDIC-FinlandSweden,xebcdicfinlandsweden,ebcdicpfi,ebcdiccpse,", 00000, L"" }, // IBM EBCDIC (Finland-Sweden) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20280, "x-EBCDIC-Italy,xebcdicitaly,", 00000, L"" }, // IBM EBCDIC (Italy) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20284, "x-EBCDIC-Spain,xebcdicspain,ebcdiccpes,", 00000, L"" }, // IBM EBCDIC (Spain-Latin America) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20285, "x-EBCDIC-UK,xebcdicuk,ebcdiccpgb,", 00000, L"" }, // IBM EBCDIC (UK) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20290, "x-EBCDIC-JapaneseKatakana,xebcdicjapanesekatakana,", 00000, L"" }, // IBM EBCDIC (Japanese Katakana) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20297, "x-EBCDIC-France,xebcdicfrance,ebcdiccpfr,", 00000, L"" }, // IBM EBCDIC (France) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20420, "x-EBCDIC-Arabic,xebcdicarabic,ebcdiccpar1,", 00000, L"" }, // IBM EBCDIC (Arabic) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20423, "x-EBCDIC-Greek,xebcdicgreek,ebcdiccpgr,", 00000, L"" }, // IBM EBCDIC (Greek) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20424, "x-EBCDIC-Hebrew,xebcdichebrew,ebcdiccphe,", 00000, L"" }, // IBM EBCDIC (Hebrew) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20833, "x-EBCDIC-KoreanExtended,xebcdickoreanextended,", 00000, L"" }, // IBM EBCDIC (Korean Extended) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20838, "x-EBCDIC-Thai,xebcdicthai,ibmthai,csibmthai,", 00000, L"" }, // IBM EBCDIC (Thai) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20871, "x-EBCDIC-Icelandic,xebcdicicelandic,ebcdiccpis,", 00000, L"" }, // IBM EBCDIC (Icelandic) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20880, 
"x-EBCDIC-CyrillicRussian,xebcdiccyrillicrussian,ebcdiccyrillic,", 00000, L"" }, // IBM EBCDIC (Cyrillic Russian) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20905, "x-EBCDIC-Turkish,xebcdicturkish,ebcdiccptr,", 00000, L"" }, // IBM EBCDIC (Turkish) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20924, "IBM00924,ibm00924,ebcdiclatin9euro,", 00000, L"" }, // IBM EBCDIC (Open System-Euro Latin-1) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 21025, "x-EBCDIC-CyrillicSerbianBulgarian,xebcdiccyrillicserbianbulgarian,", 00000, L"" }, // IBM EBCDIC (Cyrillic Serbian-Bulgarian) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50930, "x-EBCDIC-JapaneseAndKana,xebcdicjapaneseandkana,", 00000, L"" }, // IBM EBCDIC (Japanese and Japanese Katakana) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50931, "x-EBCDIC-JapaneseAndUSCanada,xebcdicjapaneseanduscanada,", 00000, L"" }, // IBM EBCDIC (Japanese and US-Canada) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50933, "x-EBCDIC-KoreanAndKoreanExtended,xebcdickoreanandkoreanextended,", 00000, L"" }, // IBM EBCDIC (Korean and Korean Extended) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50935, "x-EBCDIC-SimplifiedChinese,xebcdicsimplifiedchinese,", 00000, L"" }, // IBM EBCDIC (Chinese Simplified) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50937, "x-EBCDIC-TraditionalChinese,xebcdictraditionalchinese,", 00000, L"" }, // IBM EBCDIC (Chinese Traditional) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50939, "x-EBCDIC-JapaneseAndJapaneseLatin,xebcdicjapaneseandjapaneselatin,", 00000, L"" }, // IBM EBCDIC (Japanese and Japanese-Latin) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20105, "x-IA5,xia5,", 00000, L"" }, // Western European (IA5) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20106, "x-IA5-German,xia5german,", 00000, L"" }, // German (IA5) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20107, "x-IA5-Swedish,xia5swedish,", 00000, L"" }, // Swedish (IA5) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20108, "x-IA5-Norwegian,xia5norwegian,", 00000, L"" }, // Norwegian (IA5) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20936, "x-cp20936,xcp20936,", 00000, L"" }, // Chinese Simplified 
(GB2312) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20932, "euc-jp,,", 00000, L"" }, // Japanese (JIS X 0208-1990 & 0212-1990) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50220, "iso-2022-jp,iso2022jp,", 00000, L"" }, // Japanese (JIS) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50221, "csISO2022JP,csiso2022jp,", 00000, L"" }, // Japanese (JIS-Allow 1 byte Kana) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50222, "_iso-2022-jp$SIO,iso2022jpSIO,", 00000, L"" }, // Japanese (JIS-Allow 1 byte Kana - SO/SI) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50225, "iso-2022-kr,iso2022kr,csiso2022kr,", 00000, L"" }, // Korean (ISO-2022-KR) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50227, "x-cp50227,xcp50227,", 00000, L"" }, // Chinese Simplified (ISO-2022) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50229, "iso-2022-cn,iso2022cn,", 00000, L"" }, // Chinese Traditional (ISO-2022) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20000, "x-Chinese-CNS,xchinesecns,", 00000, L"" }, // Chinese Traditional (CNS) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20002, "x-Chinese-Eten,xchineseeten,", 00000, L"" }, // Chinese Traditional (Eten) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 51932, "euc-jp,eucjp,xeuc,xeucjp,", 00000, L"" }, // Japanese (EUC) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 51936, "euc-cn,euccn,xeuccn,", 00000, L"" }, // Chinese Simplified (EUC) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 51949, "euc-kr,euckr,cseuckr,", 00000, L"" }, // Korean (EUC) -//{ NCP_EXTERNAL_8BIT|NCP_RECODE, 52936, "hz-gb-2312,hzgb2312,hz,", 00000, L"" }, // Chinese Simplified (HZ-GB2312) -{ NCP_EXTERNAL_8BIT | NCP_RECODE, 54936, "gb18030,gb18030,", 61072, L"" } // Chinese Simplified (GB18030) - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57002, "x-iscii-de,xisciide,", 00000, L"" }, // ISCII Devanagari - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57003, "x-iscii-be,xisciibe,", 00000, L"" }, // ISCII Bengali - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57004, "x-iscii-ta,xisciita,", 00000, L"" }, // ISCII Tamil - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57005, "x-iscii-te,xisciite,", 00000, L"" }, // ISCII Telugu - //{ 
NCP_EXTERNAL_8BIT|NCP_RECODE, 57006, "x-iscii-as,xisciias,", 00000, L"" }, // ISCII Assamese - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57007, "x-iscii-or,xisciior,", 00000, L"" }, // ISCII Oriya - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57008, "x-iscii-ka,xisciika,", 00000, L"" }, // ISCII Kannada - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57009, "x-iscii-ma,xisciima,", 00000, L"" }, // ISCII Malayalam - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57010, "x-iscii-gu,xisciigu,", 00000, L"" }, // ISCII Gujarathi - //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57011, "x-iscii-pa,xisciipa,", 00000, L"" }, // ISCII Panjabi + /* 000 */{ NCP_ANSI | NCP_RECODE, CP_ACP, "ansi,system,ascii,", 61000, L"" }, + /* 001 */{ NCP_OEM | NCP_RECODE, CP_OEMCP, "oem,oem,", 61001, L"" }, + /* 002 */{ NCP_UNICODE | NCP_UNICODE_BOM, CP_UTF8, "", 61002, L"" }, + /* 003 */{ NCP_UNICODE | NCP_UNICODE_REVERSE | NCP_UNICODE_BOM, CP_UTF8, "", 61003, L"" }, + /* 004 */{ NCP_UNICODE | NCP_RECODE, CP_UTF8, "utf-16,utf16,unicode,", 61004, L"" }, + /* 005 */{ NCP_UNICODE | NCP_UNICODE_REVERSE | NCP_RECODE, CP_UTF8, "utf-16be,utf16be,unicodebe,", 61005, L"" }, + /* 006 */{ NCP_UTF8 | NCP_RECODE, CP_UTF8, "utf-8,utf8,", 61006, L"" }, + /* 007 */{ NCP_UTF8 | NCP_UTF8_SIGN, CP_UTF8, "utf-8,utf8,", 61007, L"" }, + /* 008 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, CP_UTF7, "utf-7,utf7,", 61008, L"" }, + /* 009 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 720, "DOS-720,dos720,", 61009, L"" }, + /* 010 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28596, "iso-8859-6,iso88596,arabic,csisolatinarabic,ecma114,isoir127,", 61010, L"" }, + /* 011 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10004, "x-mac-arabic,xmacarabic,", 61011, L"" }, + /* 012 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1256, "windows-1256,windows1256,cp1256", 61012, L"" }, + /* 013 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 775, "ibm775,ibm775,cp500,", 61013, L"" }, + /* 014 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28594, "iso-8859-4,iso88594,csisolatin4,isoir110,l4,latin4,", 61014, L"" }, + /* 015 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 
1257, "windows-1257,windows1257,", 61015, L"" }, + /* 016 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 852, "ibm852,ibm852,cp852,", 61016, L"" }, + /* 017 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28592, "iso-8859-2,iso88592,csisolatin2,isoir101,latin2,l2,", 61017, L"" }, + /* 018 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10029, "x-mac-ce,xmacce,", 61018, L"" }, + /* 019 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1250, "windows-1250,windows1250,xcp1250,", 61019, L"" }, + /* 020 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 936, "gb2312,gb2312,chinese,cngb,csgb2312,csgb231280,gb231280,gbk,", 61020, L"" }, + /* 021 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10008, "x-mac-chinesesimp,xmacchinesesimp,", 61021, L"" }, + /* 022 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 950, "big5,big5,cnbig5,csbig5,xxbig5,", 61022, L"" }, + /* 023 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10002, "x-mac-chinesetrad,xmacchinesetrad,", 61023, L"" }, + /* 024 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10082, "x-mac-croatian,xmaccroatian,", 61024, L"" }, + /* 025 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 866, "cp866,cp866,ibm866,", 61025, L"" }, + /* 026 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28595, "iso-8859-5,iso88595,csisolatin5,csisolatincyrillic,cyrillic,isoir144,", 61026, L"" }, + /* 027 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 20866, "koi8-r,koi8r,cskoi8r,koi,koi8,", 61027, L"" }, + /* 028 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 21866, "koi8-u,koi8u,koi8ru,", 61028, L"" }, + /* 029 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10007, "x-mac-cyrillic,xmaccyrillic,", 61029, L"" }, + /* 030 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1251, "windows-1251,windows1251,xcp1251,", 61030, L"" }, + /* 031 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28603, "iso-8859-13,iso885913,", 61031, L"" }, + /* 032 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 863, "ibm863,ibm863,", 61032, L"" }, + /* 033 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 737, "ibm737,ibm737,", 61033, L"" }, + /* 034 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28597, "iso-8859-7,iso88597,csisolatingreek,ecma118,elot928,greek,greek8,isoir126,", 61034, L"" }, + /* 
035 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10006, "x-mac-greek,xmacgreek,", 61035, L"" }, + /* 036 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1253, "windows-1253,windows1253,", 61036, L"" }, + /* 037 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 869, "ibm869,ibm869,", 61037, L"" }, + /* 038 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 862, "DOS-862,dos862,", 61038, L"" }, + /* 039 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 38598, "iso-8859-8-i,iso88598i,logical,", 61039, L"" }, + /* 040 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28598, "iso-8859-8,iso88598,csisolatinhebrew,hebrew,isoir138,visual,", 61040, L"" }, + /* 041 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10005, "x-mac-hebrew,xmachebrew,", 61041, L"" }, + /* 042 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1255, "windows-1255,windows1255,", 61042, L"" }, + /* 043 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 861, "ibm861,ibm861,", 61043, L"" }, + /* 044 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10079, "x-mac-icelandic,xmacicelandic,", 61044, L"" }, + /* 045 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10001, "x-mac-japanese,xmacjapanese,", 61045, L"" }, + /* 046 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 932, "shift_jis,shiftjis,shiftjs,csshiftjis,cswindows31j,mskanji,xmscp932,xsjis,", 61046, L"" }, + /* 047 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10003, "x-mac-korean,xmackorean,", 61047, L"" }, + /* 048 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 949, "windows-949,windows949,ksc56011987,csksc5601,euckr,isoir149,korean,ksc56011989", 61048, L"" }, + /* 049 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28593, "iso-8859-3,iso88593,latin3,isoir109,l3,", 61049, L"" }, + /* 050 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28605, "iso-8859-15,iso885915,latin9,l9,", 61050, L"" }, + /* 051 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 865, "ibm865,ibm865,", 61051, L"" }, + /* 052 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 437, "ibm437,ibm437,437,cp437,cspc8,codepage437,", 61052, L"" }, + /* 053 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 858, "ibm858,ibm858,ibm00858,", 61053, L"" }, + /* 054 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 860, "ibm860,ibm860,", 61054, L"" 
}, + /* 055 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10010, "x-mac-romanian,xmacromanian,", 61055, L"" }, + /* 056 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10021, "x-mac-thai,xmacthai,", 61056, L"" }, + /* 057 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 874, "windows-874,windows874,dos874,iso885911,tis620,", 61057, L"" }, + /* 058 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 857, "ibm857,ibm857,", 61058, L"" }, + /* 059 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28599, "iso-8859-9,iso88599,latin5,isoir148,l5,", 61059, L"" }, + /* 060 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10081, "x-mac-turkish,xmacturkish,", 61060, L"" }, + /* 061 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1254, "windows-1254,windows1254,", 61061, L"" }, + /* 062 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10017, "x-mac-ukrainian,xmacukrainian,", 61062, L"" }, + /* 063 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1258, "windows-1258,windows-258,", 61063, L"" }, + /* 064 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 850, "ibm850,ibm850,", 61064, L"" }, + /* 065 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28591, "iso-8859-1,iso88591,cp819,latin1,ibm819,isoir100,latin1,l1,", 61065, L"" }, + /* 066 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 10000, "macintosh,macintosh,", 61066, L"" }, + /* 067 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1252, "windows-1252,windows1252,cp367,cp819,ibm367,us,xansi,", 61067, L"" }, + /* 068 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 37, "ebcdic-cp-us,ebcdiccpus,ebcdiccpca,ebcdiccpwt,ebcdiccpnl,ibm037,cp037,", 61068, L"" }, + /* 069 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 500, "x-ebcdic-international,xebcdicinternational,", 61069, L"" }, + /* 070 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 875, "x-EBCDIC-GreekModern,xebcdicgreekmodern,", 61070, L"" }, + /* 071 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1026, "CP1026,cp1026,csibm1026,ibm1026,", 61071, L"" }, + /* 072 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 54936, "gb18030,gb18030,", 61072, L"" }, // Chinese Simplified (GB18030) + /* 073 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 51932, "euc-jp,eucjp,xeuc,xeucjp,", 61073, L"" }, // Japanese (EUC) + /* 
074 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 51949, "euc-kr,euckr,cseuckr,", 61074, L"" } // Korean (EUC) +/* 073 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 870, "CP870,cp870,ebcdiccproece,ebcdiccpyu,csibm870,ibm870,", 00000, L"" }, // IBM EBCDIC (Multilingual Latin-2) +/* 074 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1047, "IBM01047,ibm01047,", 00000, L"" }, // IBM EBCDIC (Open System Latin-1) +/* 075 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1140, "x-ebcdic-cp-us-euro,xebcdiccpuseuro,", 00000, L"" }, // IBM EBCDIC (US-Canada-Euro) +/* 076 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1141, "x-ebcdic-germany-euro,xebcdicgermanyeuro,", 00000, L"" }, // IBM EBCDIC (Germany-Euro) +/* 077 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1142, "x-ebcdic-denmarknorway-euro,xebcdicdenmarknorwayeuro,", 00000, L"" }, // IBM EBCDIC (Denmark-Norway-Euro) +/* 078 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1143, "x-ebcdic-finlandsweden-euro,xebcdicfinlandswedeneuro,", 00000, L"" }, // IBM EBCDIC (Finland-Sweden-Euro) +/* 079 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1144, "x-ebcdic-italy-euro,xebcdicitalyeuro,", 00000, L"" }, // IBM EBCDIC (Italy-Euro) +/* 080 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1145, "x-ebcdic-spain-euro,xebcdicspaineuro,", 00000, L"" }, // IBM EBCDIC (Spain-Latin America-Euro) +/* 081 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1146, "x-ebcdic-uk-euro,xebcdicukeuro,", 00000, L"" }, // IBM EBCDIC (UK-Euro) +/* 082 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1147, "x-ebcdic-france-euro,xebcdicfranceeuro,", 00000, L"" }, // IBM EBCDIC (France-Euro) +/* 083 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1148, "x-ebcdic-international-euro,xebcdicinternationaleuro,", 00000, L"" }, // IBM EBCDIC (International-Euro) +/* 084 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1149, "x-ebcdic-icelandic-euro,xebcdicicelandiceuro,", 00000, L"" }, // IBM EBCDIC (Icelandic-Euro) +/* 085 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 1361, "johab,johab,", 00000, L"" }, // Korean (Johab) +/* 086 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20273, "x-EBCDIC-Germany,xebcdicgermany,", 00000, L"" 
}, // IBM EBCDIC (Germany) +/* 087 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20277, "x-EBCDIC-DenmarkNorway,xebcdicdenmarknorway,ebcdiccpdk,ebcdiccpno,", 00000, L"" }, // IBM EBCDIC (Denmark-Norway) +/* 088 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20278, "x-EBCDIC-FinlandSweden,xebcdicfinlandsweden,ebcdicpfi,ebcdiccpse,", 00000, L"" }, // IBM EBCDIC (Finland-Sweden) +/* 089 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20280, "x-EBCDIC-Italy,xebcdicitaly,", 00000, L"" }, // IBM EBCDIC (Italy) +/* 090 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20284, "x-EBCDIC-Spain,xebcdicspain,ebcdiccpes,", 00000, L"" }, // IBM EBCDIC (Spain-Latin America) +/* 091 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20285, "x-EBCDIC-UK,xebcdicuk,ebcdiccpgb,", 00000, L"" }, // IBM EBCDIC (UK) +/* 092 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20290, "x-EBCDIC-JapaneseKatakana,xebcdicjapanesekatakana,", 00000, L"" }, // IBM EBCDIC (Japanese Katakana) +/* 093 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20297, "x-EBCDIC-France,xebcdicfrance,ebcdiccpfr,", 00000, L"" }, // IBM EBCDIC (France) +/* 094 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20420, "x-EBCDIC-Arabic,xebcdicarabic,ebcdiccpar1,", 00000, L"" }, // IBM EBCDIC (Arabic) +/* 095 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20423, "x-EBCDIC-Greek,xebcdicgreek,ebcdiccpgr,", 00000, L"" }, // IBM EBCDIC (Greek) +/* 096 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20424, "x-EBCDIC-Hebrew,xebcdichebrew,ebcdiccphe,", 00000, L"" }, // IBM EBCDIC (Hebrew) +/* 097 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20833, "x-EBCDIC-KoreanExtended,xebcdickoreanextended,", 00000, L"" }, // IBM EBCDIC (Korean Extended) +/* 098 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20838, "x-EBCDIC-Thai,xebcdicthai,ibmthai,csibmthai,", 00000, L"" }, // IBM EBCDIC (Thai) +/* 099 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20871, "x-EBCDIC-Icelandic,xebcdicicelandic,ebcdiccpis,", 00000, L"" }, // IBM EBCDIC (Icelandic) +/* 100 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20880, "x-EBCDIC-CyrillicRussian,xebcdiccyrillicrussian,ebcdiccyrillic,", 00000, L"" }, // IBM EBCDIC 
(Cyrillic Russian) +/* 101 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20905, "x-EBCDIC-Turkish,xebcdicturkish,ebcdiccptr,", 00000, L"" }, // IBM EBCDIC (Turkish) +/* 102 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20924, "IBM00924,ibm00924,ebcdiclatin9euro,", 00000, L"" }, // IBM EBCDIC (Open System-Euro Latin-1) +/* 103 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 21025, "x-EBCDIC-CyrillicSerbianBulgarian,xebcdiccyrillicserbianbulgarian,", 00000, L"" }, // IBM EBCDIC (Cyrillic Serbian-Bulgarian) +/* 104 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50930, "x-EBCDIC-JapaneseAndKana,xebcdicjapaneseandkana,", 00000, L"" }, // IBM EBCDIC (Japanese and Japanese Katakana) +/* 105 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50931, "x-EBCDIC-JapaneseAndUSCanada,xebcdicjapaneseanduscanada,", 00000, L"" }, // IBM EBCDIC (Japanese and US-Canada) +/* 106 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50933, "x-EBCDIC-KoreanAndKoreanExtended,xebcdickoreanandkoreanextended,", 00000, L"" }, // IBM EBCDIC (Korean and Korean Extended) +/* 107 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50935, "x-EBCDIC-SimplifiedChinese,xebcdicsimplifiedchinese,", 00000, L"" }, // IBM EBCDIC (Chinese Simplified) +/* 108 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50937, "x-EBCDIC-TraditionalChinese,xebcdictraditionalchinese,", 00000, L"" }, // IBM EBCDIC (Chinese Traditional) +/* 109 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50939, "x-EBCDIC-JapaneseAndJapaneseLatin,xebcdicjapaneseandjapaneselatin,", 00000, L"" }, // IBM EBCDIC (Japanese and Japanese-Latin) +/* 110 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20105, "x-IA5,xia5,", 00000, L"" }, // Western European (IA5) +/* 111 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20106, "x-IA5-German,xia5german,", 00000, L"" }, // German (IA5) +/* 112 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20107, "x-IA5-Swedish,xia5swedish,", 00000, L"" }, // Swedish (IA5) +/* 113 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20108, "x-IA5-Norwegian,xia5norwegian,", 00000, L"" }, // Norwegian (IA5) +/* 114 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20936, "x-cp20936,xcp20936,", 
00000, L"" }, // Chinese Simplified (GB2312) +/* 115 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20932, "euc-jp,,", 00000, L"" }, // Japanese (JIS X 0208-1990 & 0212-1990) +/* 116 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50220, "iso-2022-jp,iso2022jp,", 00000, L"" }, // Japanese (JIS) +/* 117 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50221, "csISO2022JP,csiso2022jp,", 00000, L"" }, // Japanese (JIS-Allow 1 byte Kana) +/* 118 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50222, "_iso-2022-jp$SIO,iso2022jpSIO,", 00000, L"" }, // Japanese (JIS-Allow 1 byte Kana - SO/SI) +/* 119 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50225, "iso-2022-kr,iso2022kr,csiso2022kr,", 00000, L"" }, // Korean (ISO-2022-KR) +/* 120 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50227, "x-cp50227,xcp50227,", 00000, L"" }, // Chinese Simplified (ISO-2022) +/* 121 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 50229, "iso-2022-cn,iso2022cn,", 00000, L"" }, // Chinese Traditional (ISO-2022) +/* 122 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20000, "x-Chinese-CNS,xchinesecns,", 00000, L"" }, // Chinese Traditional (CNS) +/* 123 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 20002, "x-Chinese-Eten,xchineseeten,", 00000, L"" }, // Chinese Traditional (Eten) +/* 125 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 51936, "euc-cn,euccn,xeuccn,", 00000, L"" }, // Chinese Simplified (EUC) +/* 127 *///{ NCP_EXTERNAL_8BIT|NCP_RECODE, 52936, "hz-gb-2312,hzgb2312,hz,", 00000, L"" }, // Chinese Simplified (HZ-GB2312) +/* 128 */ //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57002, "x-iscii-de,xisciide,", 00000, L"" }, // ISCII Devanagari +/* 129 */ //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57003, "x-iscii-be,xisciibe,", 00000, L"" }, // ISCII Bengali +/* 130 */ //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57004, "x-iscii-ta,xisciita,", 00000, L"" }, // ISCII Tamil +/* 131 */ //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57005, "x-iscii-te,xisciite,", 00000, L"" }, // ISCII Telugu +/* 132 */ //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57006, "x-iscii-as,xisciias,", 00000, L"" }, // ISCII Assamese +/* 133 */ //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57007, 
"x-iscii-or,xisciior,", 00000, L"" }, // ISCII Oriya +/* 134 */ //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57008, "x-iscii-ka,xisciika,", 00000, L"" }, // ISCII Kannada +/* 135 */ //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57009, "x-iscii-ma,xisciima,", 00000, L"" }, // ISCII Malayalam +/* 136 */ //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57010, "x-iscii-gu,xisciigu,", 00000, L"" }, // ISCII Gujarathi +/* 137 */ //{ NCP_EXTERNAL_8BIT|NCP_RECODE, 57011, "x-iscii-pa,xisciipa,", 00000, L"" }, // ISCII Panjabi }; int Encoding_CountOf() { return COUNTOF(g_Encodings); } +// ============================================================================ + int Encoding_Current(int iEncoding) { static int CurrentEncoding = CPI_NONE; @@ -207,6 +209,7 @@ int Encoding_Current(int iEncoding) { } return CurrentEncoding; } +// ============================================================================ int Encoding_SrcCmdLn(int iSrcEncoding) { @@ -223,6 +226,7 @@ int Encoding_SrcCmdLn(int iSrcEncoding) { } return SourceEncoding; } +// ============================================================================ int Encoding_SrcWeak(int iSrcWeakEnc) { @@ -239,6 +243,7 @@ int Encoding_SrcWeak(int iSrcWeakEnc) { } return SourceWeakEncoding; } +// ============================================================================ BOOL Encoding_HasChanged(int iOriginalEncoding) { @@ -252,6 +257,233 @@ BOOL Encoding_HasChanged(int iOriginalEncoding) { // ============================================================================ +// ============================================================================ +// ============================================================================ + +typedef unsigned short uint16_t; +typedef unsigned int uint32_t; + +typedef struct { + int encID; + uint16_t dbyte; + const char* encoding; +} freq_analysis_data_t; + + +static freq_analysis_data_t freq_analysis_data[] = +{ +{ 19, 0x9a74, "windows-1250" }, // "št" (Czech) +{ 19, 0xe865, "windows-1250" }, // "če" (Czech) +{ 19, 0xf865, 
"windows-1250" }, // "ře" (Czech) +{ 19, 0xe167, "windows-1250" }, // "ág" (Hungarian) +{ 19, 0xe96c, "windows-1250" }, // "él" (Hungarian) +{ 19, 0xb36f, "windows-1250" }, // "ło" (Polish) +{ 19, 0xea7a, "windows-1250" }, // "ęz" (Polish) +{ 19, 0xf377, "windows-1250" }, // "ów" (Polish) +{ 19, 0x9d20, "windows-1250" }, // "ť " (Slovak) +{ 19, 0xfa9d, "windows-1250" }, // "úť" (Slovak) +{ 19, 0x9e69, "windows-1250" }, // "ži" (Slovenian) +{ 19, 0xe869, "windows-1250" }, // "či" (Slovenian) +{ 67, 0xe020, "windows-1252" }, // "à " (French) +{ 67, 0xe920, "windows-1252" }, // "é " (French) +{ 67, 0xe963, "windows-1252" }, // "éc" (French) +{ 67, 0xe965, "windows-1252" }, // "ée" (French) +{ 67, 0xe972, "windows-1252" }, // "ér" (French) +{ 67, 0xe4e4, "windows-1252" }, // "ää" (Finnish) +{ 67, 0xe474, "windows-1252" }, // "ät" (German) +{ 67, 0xfc72, "windows-1252" }, // "ür" (German) +{ 67, 0xed6e, "windows-1252" }, // "ín" (Spanish) +{ 67, 0xf36e, "windows-1252" }, // "ón" (Spanish) +{ 52, 0x8220, "cp437" }, // "é " (French) +{ 52, 0x8263, "cp437" }, // "éc" (French) +{ 52, 0x8265, "cp437" }, // "ée" (French) +{ 52, 0x8272, "cp437" }, // "ér" (French) +{ 52, 0x8520, "cp437" }, // "à " (French) +{ 52, 0x8172, "cp437" }, // "ür" (German) +{ 52, 0x8474, "cp437" }, // "ät" (German) +{ 52, 0xc4c4, "cp437" }, // "──" +{ 52, 0xcdcd, "cp437" }, // "══" +{ 52, 0xdbdb, "cp437" }, // "██" +{ 72, 0xa1a1, "gbk" }, // " " +{ 72, 0xa1a2, "gbk" }, // "、" +{ 72, 0xa1a3, "gbk" }, // "。" +{ 72, 0xa1a4, "gbk" }, // "·" +{ 72, 0xa1b6, "gbk" }, // "《" +{ 72, 0xa1b7, "gbk" }, // "》" +{ 72, 0xa3ac, "gbk" }, // "," +{ 72, 0xa3ba, "gbk" }, // ":" +{ 72, 0xb5c4, "gbk" }, // "的" +{ 72, 0xc1cb, "gbk" }, // "了" +{ 72, 0xd2bb, "gbk" }, // "一" +{ 72, 0xcac7, "gbk" }, // "是" +{ 72, 0xb2bb, "gbk" }, // "不" +{ 72, 0xb8f6, "gbk" }, // "个" +{ 72, 0xc8cb, "gbk" }, // "人" +{ 72, 0xd5e2, "gbk" }, // "这" +{ 72, 0xd3d0, "gbk" }, // "有" +{ 72, 0xced2, "gbk" }, // "我" +{ 72, 0xc4e3, "gbk" }, // "你" +{ 72, 
0xcbfb, "gbk" }, // "他" +{ 72, 0xcbfd, "gbk" }, // "她" +{ 72, 0xc9cf, "gbk" }, // "上" +{ 72, 0xbfb4, "gbk" }, // "看" +{ 72, 0xd6ae, "gbk" }, // "之" +{ 72, 0xbbb9, "gbk" }, // "还" +{ 72, 0xbfc9, "gbk" }, // "可" +{ 72, 0xbaf3, "gbk" }, // "后" +{ 72, 0xd6d0, "gbk" }, // "中" +{ 72, 0xd0d0, "gbk" }, // "行" +{ 72, 0xb1d2, "gbk" }, // "币" +{ 72, 0xb3f6, "gbk" }, // "出" +{ 72, 0xb7d1, "gbk" }, // "费" +{ 72, 0xb8d0, "gbk" }, // "感" +{ 72, 0xbef5, "gbk" }, // "觉" +{ 72, 0xc4ea, "gbk" }, // "年" +{ 72, 0xd4c2, "gbk" }, // "月" +{ 72, 0xc8d5, "gbk" }, // "日" +{ 22, 0xa140, "big5" }, // " " +{ 22, 0xa141, "big5" }, // "," +{ 22, 0xa143, "big5" }, // "。" +{ 22, 0xa147, "big5" }, // ":" +{ 22, 0xaaba, "big5" }, // "的" +{ 22, 0xa446, "big5" }, // "了" +{ 22, 0xa440, "big5" }, // "一" +{ 22, 0xac4f, "big5" }, // "是" +{ 22, 0xa4a3, "big5" }, // "不" +{ 22, 0xa448, "big5" }, // "人" +{ 22, 0xa7da, "big5" }, // "我" +{ 22, 0xa741, "big5" }, // "你" +{ 22, 0xa54c, "big5" }, // "他" +{ 22, 0xa66f, "big5" }, // "她" +{ 22, 0xadd3, "big5" }, // "個" +{ 22, 0xa457, "big5" }, // "上" +{ 22, 0xa662, "big5" }, // "在" +{ 22, 0xbba1, "big5" }, // "說" +{ 22, 0xa65e, "big5" }, // "回" +{ 46, 0x8140, "sjis" }, // " " +{ 46, 0x8141, "sjis" }, // "、" +{ 46, 0x8142, "sjis" }, // "。" +{ 46, 0x8145, "sjis" }, // "・" +{ 46, 0x8146, "sjis" }, // ":" +{ 46, 0x815b, "sjis" }, // "ー" +{ 46, 0x82b5, "sjis" }, // "し" +{ 46, 0x82bd, "sjis" }, // "た" +{ 46, 0x82c8, "sjis" }, // "な" +{ 46, 0x82c9, "sjis" }, // "に" +{ 46, 0x82cc, "sjis" }, // "の" +{ 46, 0x82dc, "sjis" }, // "ま" +{ 46, 0x82f0, "sjis" }, // "を" +{ 46, 0x8367, "sjis" }, // "ト" +{ 46, 0x8393, "sjis" }, // "ン" +{ 46, 0x89ef, "sjis" }, // "会" +{ 46, 0x906c, "sjis" }, // "人" +{ 46, 0x9094, "sjis" }, // "数" +{ 46, 0x93fa, "sjis" }, // "日" +{ 46, 0x95f1, "sjis" }, // "報" +{ 73, 0xa1bc, "euc-jp" }, // "ー" +{ 73, 0xa4bf, "euc-jp" }, // "た" +{ 73, 0xa4ca, "euc-jp" }, // "な" +{ 73, 0xa4cb, "euc-jp" }, // "に" +{ 73, 0xa4ce, "euc-jp" }, // "の" +{ 73, 0xa4de, "euc-jp" }, // 
"ま" +{ 73, 0xa4f2, "euc-jp" }, // "を" +{ 73, 0xa5c8, "euc-jp" }, // "ト" +{ 73, 0xa5f3, "euc-jp" }, // "ン" +{ 73, 0xb2f1, "euc-jp" }, // "会" +{ 73, 0xbfcd, "euc-jp" }, // "人" +{ 73, 0xbff4, "euc-jp" }, // "数" +{ 73, 0xc6fc, "euc-jp" }, // "日" +{ 73, 0xcaf3, "euc-jp" }, // "報" +{ 74, 0xc0cc, "euc-kr" }, // "이" +{ 74, 0xb0fa, "euc-kr" }, // "과" +{ 74, 0xb1e2, "euc-kr" }, // "기" +{ 74, 0xb4c2, "euc-kr" }, // "는" +{ 74, 0xb7ce, "euc-kr" }, // "로" +{ 74, 0xb1db, "euc-kr" }, // "글" +{ 74, 0xc5e4, "euc-kr" }, // "토" +{ 74, 0xc1a4, "euc-kr" }, // "정" +{ 27, 0xc920, "koi8-r" }, // "и " +{ 27, 0xc7cf, "koi8-r" }, // "го" +{ 27, 0xcbcf, "koi8-r" }, // "ко" +{ 27, 0xd3cb, "koi8-r" }, // "ск" +{ 27, 0xd3d4, "koi8-r" }, // "ст" +{ 28, 0xa6a7, "koi8-u" }, // "ії" +{ 28, 0xa6ce, "koi8-u" }, // "ін" +{ 28, 0xa6d7, "koi8-u" }, // "ів" +{ 28, 0xa7ce, "koi8-u" }, // "їн" +{ 28, 0xd0cf, "koi8-u" }, // "по" +{ 28, 0xd4c9, "koi8-u" }, // "ти" +}; +// ============================================================================ + +typedef struct _char_count_t { + uint16_t first; + uint32_t second; +} char_count_t; + +//typedef pair char_count_t; +//typedef map char_count_map_t; +//typedef vector char_count_vec_t; + + +int __fastcall check_freq_dbyte(uint16_t dbyte) +{ + for (size_t i = 0; i < sizeof freq_analysis_data / sizeof(freq_analysis_data_t); ++i) { + if (dbyte == freq_analysis_data[i].dbyte) { + return freq_analysis_data[i].encID; + } + } + return CPI_NONE; +} +// ============================================================================ + + +int __fastcall search_freq_dbytes(const UT_array* dbyte_char_cnt) +{ + size_t max_comp_idx = 10; + if (max_comp_idx > utarray_len(dbyte_char_cnt)) { + max_comp_idx = utarray_len(dbyte_char_cnt); + } + for (size_t i = 0; i < max_comp_idx; ++i) { + + const char_count_t* ccnt = (char_count_t*)utarray_eltptr(dbyte_char_cnt, i); + + const int enc = check_freq_dbyte(ccnt->first); + if (enc > CPI_NONE) { + return enc; + } + } + return CPI_NONE; +} 
+// ============================================================================ + + + +int Encoding_TellEncoding(const unsigned char* const buffer, const size_t len) +{ + int iEncoding = CPI_NONE; + UT_icd char_count_icd = { sizeof(char_count_t), NULL, NULL, NULL }; + UT_array* char_count_vector = NULL; + + utarray_new(char_count_vector, &char_count_icd); + utarray_reserve(char_count_vector, 256); + + ///... + + utarray_clear(char_count_vector); + utarray_free(char_count_vector); + + UNUSED(buffer); + UNUSED(len); + return iEncoding; +} +// ============================================================================ + + + +// ============================================================================ +// ============================================================================ + void Encoding_InitDefaults() { @@ -307,6 +539,7 @@ void Encoding_InitDefaults() } } } + } // ============================================================================ @@ -891,11 +1124,12 @@ BOOL IsUTF8(const char* pTest, int nLength) break; } - return (current == kSTART) ? TRUE : FALSE; + return (current == kSTART) ? 
true : false; } // ============================================================================ + BOOL IsUTF7(const char* pTest, int nLength) { int i; const char *pt = pTest; @@ -1058,194 +1292,22 @@ INT UTF8_mbslen(LPCSTR source, INT byte_length) * @author Wu Yongwei */ - -typedef unsigned short uint16_t; -typedef unsigned int uint32_t; - -typedef struct _char_count_t { - uint16_t first; - uint32_t second; -} char_count_t; - -//typedef pair char_count_t; -//typedef map char_count_map_t; -//typedef vector char_count_vec_t; - - -typedef struct _freq_analysis_data_t { - uint16_t dbyte; - const char* enc; -} freq_analysis_data_t; - - - -typedef enum { - UTF8_INVALID, - UTF8_1, - UTF8_2, - UTF8_3, - UTF8_4, - UTF8_TAIL -} UTF8_State; - - #define MAX_CHAR 256 + + + + + static const unsigned char NON_TEXT_CHARS[] = { 0, 26, 127, 255 }; static const char NUL = '\0'; static const char DOS_EOF = '\x1A'; static const int EVEN = 0; static const int ODD = 1; -static UTF8_State utf8_char_table[MAX_CHAR]; // ============================================================================ -static freq_analysis_data_t freq_analysis_data[] = { - { 0x9a74, "windows-1250" }, // "št" (Czech) - { 0xe865, "windows-1250" }, // "če" (Czech) - { 0xf865, "windows-1250" }, // "ře" (Czech) - { 0xe167, "windows-1250" }, // "ág" (Hungarian) - { 0xe96c, "windows-1250" }, // "él" (Hungarian) - { 0xb36f, "windows-1250" }, // "ło" (Polish) - { 0xea7a, "windows-1250" }, // "ęz" (Polish) - { 0xf377, "windows-1250" }, // "ów" (Polish) - { 0x9d20, "windows-1250" }, // "ť " (Slovak) - { 0xfa9d, "windows-1250" }, // "úť" (Slovak) - { 0x9e69, "windows-1250" }, // "ži" (Slovenian) - { 0xe869, "windows-1250" }, // "či" (Slovenian) - { 0xe020, "windows-1252" }, // "à " (French) - { 0xe920, "windows-1252" }, // "é " (French) - { 0xe963, "windows-1252" }, // "éc" (French) - { 0xe965, "windows-1252" }, // "ée" (French) - { 0xe972, "windows-1252" }, // "ér" (French) - { 0xe4e4, "windows-1252" }, // "ää" (Finnish) - 
{ 0xe474, "windows-1252" }, // "ät" (German) - { 0xfc72, "windows-1252" }, // "ür" (German) - { 0xed6e, "windows-1252" }, // "ín" (Spanish) - { 0xf36e, "windows-1252" }, // "ón" (Spanish) - { 0x8220, "cp437" }, // "é " (French) - { 0x8263, "cp437" }, // "éc" (French) - { 0x8265, "cp437" }, // "ée" (French) - { 0x8272, "cp437" }, // "ér" (French) - { 0x8520, "cp437" }, // "à " (French) - { 0x8172, "cp437" }, // "ür" (German) - { 0x8474, "cp437" }, // "ät" (German) - { 0xc4c4, "cp437" }, // "──" - { 0xcdcd, "cp437" }, // "══" - { 0xdbdb, "cp437" }, // "██" - { 0xa1a1, "gbk" }, // " " - { 0xa1a2, "gbk" }, // "、" - { 0xa1a3, "gbk" }, // "。" - { 0xa1a4, "gbk" }, // "·" - { 0xa1b6, "gbk" }, // "《" - { 0xa1b7, "gbk" }, // "》" - { 0xa3ac, "gbk" }, // "," - { 0xa3ba, "gbk" }, // ":" - { 0xb5c4, "gbk" }, // "的" - { 0xc1cb, "gbk" }, // "了" - { 0xd2bb, "gbk" }, // "一" - { 0xcac7, "gbk" }, // "是" - { 0xb2bb, "gbk" }, // "不" - { 0xb8f6, "gbk" }, // "个" - { 0xc8cb, "gbk" }, // "人" - { 0xd5e2, "gbk" }, // "这" - { 0xd3d0, "gbk" }, // "有" - { 0xced2, "gbk" }, // "我" - { 0xc4e3, "gbk" }, // "你" - { 0xcbfb, "gbk" }, // "他" - { 0xcbfd, "gbk" }, // "她" - { 0xc9cf, "gbk" }, // "上" - { 0xbfb4, "gbk" }, // "看" - { 0xd6ae, "gbk" }, // "之" - { 0xbbb9, "gbk" }, // "还" - { 0xbfc9, "gbk" }, // "可" - { 0xbaf3, "gbk" }, // "后" - { 0xd6d0, "gbk" }, // "中" - { 0xd0d0, "gbk" }, // "行" - { 0xb1d2, "gbk" }, // "币" - { 0xb3f6, "gbk" }, // "出" - { 0xb7d1, "gbk" }, // "费" - { 0xb8d0, "gbk" }, // "感" - { 0xbef5, "gbk" }, // "觉" - { 0xc4ea, "gbk" }, // "年" - { 0xd4c2, "gbk" }, // "月" - { 0xc8d5, "gbk" }, // "日" - { 0xa140, "big5" }, // " " - { 0xa141, "big5" }, // "," - { 0xa143, "big5" }, // "。" - { 0xa147, "big5" }, // ":" - { 0xaaba, "big5" }, // "的" - { 0xa446, "big5" }, // "了" - { 0xa440, "big5" }, // "一" - { 0xac4f, "big5" }, // "是" - { 0xa4a3, "big5" }, // "不" - { 0xa448, "big5" }, // "人" - { 0xa7da, "big5" }, // "我" - { 0xa741, "big5" }, // "你" - { 0xa54c, "big5" }, // "他" - { 0xa66f, "big5" }, // 
"她" - { 0xadd3, "big5" }, // "個" - { 0xa457, "big5" }, // "上" - { 0xa662, "big5" }, // "在" - { 0xbba1, "big5" }, // "說" - { 0xa65e, "big5" }, // "回" - { 0x8140, "sjis" }, // " " - { 0x8141, "sjis" }, // "、" - { 0x8142, "sjis" }, // "。" - { 0x8145, "sjis" }, // "・" - { 0x8146, "sjis" }, // ":" - { 0x815b, "sjis" }, // "ー" - { 0x82b5, "sjis" }, // "し" - { 0x82bd, "sjis" }, // "た" - { 0x82c8, "sjis" }, // "な" - { 0x82c9, "sjis" }, // "に" - { 0x82cc, "sjis" }, // "の" - { 0x82dc, "sjis" }, // "ま" - { 0x82f0, "sjis" }, // "を" - { 0x8367, "sjis" }, // "ト" - { 0x8393, "sjis" }, // "ン" - { 0x89ef, "sjis" }, // "会" - { 0x906c, "sjis" }, // "人" - { 0x9094, "sjis" }, // "数" - { 0x93fa, "sjis" }, // "日" - { 0x95f1, "sjis" }, // "報" - { 0xa1bc, "euc-jp" }, // "ー" - { 0xa4bf, "euc-jp" }, // "た" - { 0xa4ca, "euc-jp" }, // "な" - { 0xa4cb, "euc-jp" }, // "に" - { 0xa4ce, "euc-jp" }, // "の" - { 0xa4de, "euc-jp" }, // "ま" - { 0xa4f2, "euc-jp" }, // "を" - { 0xa5c8, "euc-jp" }, // "ト" - { 0xa5f3, "euc-jp" }, // "ン" - { 0xb2f1, "euc-jp" }, // "会" - { 0xbfcd, "euc-jp" }, // "人" - { 0xbff4, "euc-jp" }, // "数" - { 0xc6fc, "euc-jp" }, // "日" - { 0xcaf3, "euc-jp" }, // "報" - { 0xc0cc, "euc-kr" }, // "이" - { 0xb0fa, "euc-kr" }, // "과" - { 0xb1e2, "euc-kr" }, // "기" - { 0xb4c2, "euc-kr" }, // "는" - { 0xb7ce, "euc-kr" }, // "로" - { 0xb1db, "euc-kr" }, // "글" - { 0xc5e4, "euc-kr" }, // "토" - { 0xc1a4, "euc-kr" }, // "정" - { 0xc920, "koi8-r" }, // "и " - { 0xc7cf, "koi8-r" }, // "го" - { 0xcbcf, "koi8-r" }, // "ко" - { 0xd3cb, "koi8-r" }, // "ск" - { 0xd3d4, "koi8-r" }, // "ст" - { 0xa6a7, "koi8-u" }, // "ії" - { 0xa6ce, "koi8-u" }, // "ін" - { 0xa6d7, "koi8-u" }, // "ів" - { 0xa7ce, "koi8-u" }, // "їн" - { 0xd0cf, "koi8-u" }, // "по" - { 0xd4c9, "koi8-u" }, // "ти" -}; -// ============================================================================ - static size_t nul_count_byte[2]; static size_t nul_count_word[2]; @@ -1274,36 +1336,6 @@ static inline bool is_non_text(char ch) -void 
init_utf8_char_table() -{ - int ch = 0; - utf8_char_table[ch] = UTF8_INVALID; - ++ch; - for (; ch <= 0x7f; ++ch) { - utf8_char_table[ch] = UTF8_1; - } - for (; ch <= 0xbf; ++ch) { - utf8_char_table[ch] = UTF8_TAIL; - } - for (; ch <= 0xc1; ++ch) { - utf8_char_table[ch] = UTF8_INVALID; - } - for (; ch <= 0xdf; ++ch) { - utf8_char_table[ch] = UTF8_2; - } - for (; ch <= 0xef; ++ch) { - utf8_char_table[ch] = UTF8_3; - } - for (; ch <= 0xf4; ++ch) { - utf8_char_table[ch] = UTF8_4; - } - for (; ch <= 0xff; ++ch) { - utf8_char_table[ch] = UTF8_INVALID; - } -} -// ============================================================================ - - static void init_sbyte_char_count(char_count_t sbyte_char_cnt[]) { @@ -1350,36 +1382,6 @@ static const char* check_ucs_bom(const unsigned char* const buffer, const size_t -static const char* check_freq_dbyte(uint16_t dbyte) -{ - for (size_t i = 0; - i < sizeof freq_analysis_data / sizeof(freq_analysis_data_t); - ++i) { - if (dbyte == freq_analysis_data[i].dbyte) { - return freq_analysis_data[i].enc; - } - } - return NULL; -} -// ============================================================================ - - - -static const char* search_freq_dbytes(const char_count_vec_t* dbyte_char_cnt) -{ - size_t max_comp_idx = 10; - if (max_comp_idx > dbyte_char_cnt->size()) { - max_comp_idx = dbyte_char_cnt->size(); - } - for (size_t i = 0; i < max_comp_idx; ++i) { - const char* enc = check_freq_dbyte(dbyte_char_cnt[i].first); - if (enc) { - return enc; - } - } - return NULL; -} -// ============================================================================ @@ -1562,11 +1564,6 @@ static bool bInitDone = false; int GetBufferEncoding(const char* const buffer, const size_t len) { - if (!bInitDone) { - init_utf8_char_table(); - bInitDone = true; - } - const char* enc = tellenc_simplify(buffer, len); if (enc) diff --git a/src/Notepad3.rc b/src/Notepad3.rc index 3a796c651..ad5c7cc41 100644 --- a/src/Notepad3.rc +++ b/src/Notepad3.rc @@ -1659,6 
+1659,8 @@ END STRINGTABLE BEGIN 61072 "C;Chinese Simplified (GB18030);GB18030" + 61073 "C;Japanese (EUC);Japanese (EUC)" + 61074 "C;Korean (EUC);Korean (EUC)" END STRINGTABLE From 98c919dfcfec942e43857206e313b76f7bcdca22 Mon Sep 17 00:00:00 2001 From: Rainer Kottenhoff Date: Fri, 2 Mar 2018 16:13:34 +0100 Subject: [PATCH 6/6] + fix: integration of "tellenc" encoding detection ideas --- src/Edit.c | 56 ++-- src/Encoding.c | 815 ++++++++++++++++++++++++------------------------ src/Encoding.h | 11 +- src/Helpers.c | 4 +- src/Helpers.h | 11 +- src/Notepad3.rc | 14 +- 6 files changed, 459 insertions(+), 452 deletions(-) diff --git a/src/Edit.c b/src/Edit.c index b6f456b79..5b5070411 100644 --- a/src/Edit.c +++ b/src/Edit.c @@ -1064,9 +1064,19 @@ BOOL EditLoadFile( } const int iForcedEncoding = Encoding_SrcCmdLn(CPI_GET); - const int iFileEncWeak = (Encoding_SrcWeak(CPI_GET) != CPI_NONE) ? Encoding_SrcWeak(CPI_GET) : CPI_ANSI_DEFAULT; - const int iPreferedEncoding = (bPreferOEM) ? g_DOSEncoding : (bUseDefaultForFileEncoding ? g_iDefaultNewFileEncoding : iFileEncWeak); - //@@@ Encoding_IsINTERNAL(iFileEncWeak) ? g_iDefaultNewFileEncoding : iFileEncWeak; + const int iFileEncWeak = Encoding_SrcWeak(CPI_GET); + const int iAnalyzedEncoding = !bSkipEncodingDetection ? Encoding_Analyze(lpData, cbData) : CPI_NONE; + + // choose best encoding guess + int iPreferedEncoding = (bPreferOEM) ? g_DOSEncoding : (bUseDefaultForFileEncoding ? 
g_iDefaultNewFileEncoding : CPI_ANSI_DEFAULT); + + if (iForcedEncoding != CPI_NONE) + iPreferedEncoding = iForcedEncoding; + else if (iFileEncWeak != CPI_NONE) + iPreferedEncoding = iFileEncWeak; + else if (iAnalyzedEncoding != CPI_NONE) + iPreferedEncoding = iAnalyzedEncoding; + BOOL bBOM = FALSE; BOOL bReverse = FALSE; @@ -1088,10 +1098,11 @@ BOOL EditLoadFile( SendMessage(hwnd,SCI_SETEOLMODE,iLineEndings[g_iDefaultEOLMode],0); GlobalFree(lpData); } - else if (!bSkipEncodingDetection && - (iForcedEncoding == CPI_NONE || iForcedEncoding == CPI_UNICODE || iForcedEncoding == CPI_UNICODEBE) && - (iForcedEncoding == CPI_UNICODE || iForcedEncoding == CPI_UNICODEBE || IsUnicode(lpData,cbData,&bBOM,&bReverse)) && - (iForcedEncoding == CPI_UNICODE || iForcedEncoding == CPI_UNICODEBE || !IsUTF8Signature(lpData))) // check for UTF-8 signature + // === UNICODE === + else if (!bSkipEncodingDetection && //TODO: use Encoding_IsUNICODE(iAnalyzedEncoding) here ??? + (Encoding_IsUNICODE(iForcedEncoding) || (iForcedEncoding == CPI_NONE)) && + (Encoding_IsUNICODE(iForcedEncoding) || IsUnicode(lpData,cbData,&bBOM,&bReverse)) && + (Encoding_IsUNICODE(iForcedEncoding) || !IsUTF8Signature(lpData))) // check for UTF-8 signature { char* lpDataUTF8; @@ -1147,19 +1158,20 @@ BOOL EditLoadFile( } } - else { + else { // === ALL OTHERS === + FileVars_Init(lpData,cbData,&fvCurFile); - if (!bSkipEncodingDetection && (iForcedEncoding == CPI_NONE || iForcedEncoding == CPI_UTF8 || iForcedEncoding == CPI_UTF8SIGN) && - ((IsUTF8Signature(lpData) || - FileVars_IsUTF8(&fvCurFile) || - (iForcedEncoding == CPI_UTF8 || iForcedEncoding == CPI_UTF8SIGN) || - (!bPreferOEM && bLoadASCIIasUTF8) || // from menu "Reload As UTF-8" - (IsUTF8(lpData,cbData) && - (((UTF8_mbslen_bytes(UTF8StringStart(lpData)) - 1 != - UTF8_mbslen(UTF8StringStart(lpData),IsUTF8Signature(lpData) ? 
cbData-3 : cbData)) || - (!bPreferOEM && ( - Encoding_IsUTF8(iPreferedEncoding) || bLoadASCIIasUTF8))))))) && !(FileVars_IsNonUTF8(&fvCurFile) && - (iForcedEncoding != CPI_UTF8 && iForcedEncoding != CPI_UTF8SIGN))) + + // === UTF-8 === + if (!bSkipEncodingDetection && (Encoding_IsNONE(iForcedEncoding) || Encoding_IsUTF8(iForcedEncoding)) && + ((IsUTF8Signature(lpData) || + FileVars_IsUTF8(&fvCurFile) || + (Encoding_IsUTF8(iForcedEncoding) || + (!bPreferOEM && bLoadASCIIasUTF8) || // from menu "Reload As UTF-8" + (IsUTF8(lpData,cbData) && + (((UTF8_mbslen_bytes(UTF8StringStart(lpData)) - 1 != UTF8_mbslen(UTF8StringStart(lpData),IsUTF8Signature(lpData) ? cbData-3 : cbData)) || + (!bPreferOEM && (Encoding_IsUTF8(iPreferedEncoding) || bLoadASCIIasUTF8))))))) && + !(FileVars_IsNonUTF8(&fvCurFile) && !Encoding_IsUTF8(iForcedEncoding)))) { Encoding_SciSetCodePage(hwnd,CPI_UTF8); EditSetNewText(hwnd,"",0); @@ -1176,13 +1188,13 @@ BOOL EditLoadFile( GlobalFree(lpData); } - else { + else { // === ALL OTHER === - if (iForcedEncoding != CPI_NONE) + if (!Encoding_IsNONE(iForcedEncoding)) *iEncoding = iForcedEncoding; else { *iEncoding = FileVars_GetEncoding(&fvCurFile); - if (*iEncoding == CPI_NONE) { + if (Encoding_IsNONE(*iEncoding)) { if (fvCurFile.mask & FV_ENCODING) *iEncoding = CPI_ANSI_DEFAULT; else { diff --git a/src/Encoding.c b/src/Encoding.c index 0d6bba481..fe7268cf7 100644 --- a/src/Encoding.c +++ b/src/Encoding.c @@ -3,11 +3,11 @@ * * * Notepad3 * * * -* Encoding.c * -* General helper functions * +* Encoding.c * +* Handling and Helpers for File Encoding * * Based on code from Notepad2, (c) Florian Balmer 1996-2011 * -* Parts taken from SciTE, (c) Neil Hodgson * -* MinimizeToTray, (c) 2000 Matthew Ellis * +* * +* * * * * (c) Rizonesoft 2015-2018 * * https://rizonesoft.com * @@ -28,7 +28,7 @@ #include #include -#include +#include #include "../uthash/utarray.h" @@ -219,7 +219,7 @@ int Encoding_SrcCmdLn(int iSrcEncoding) { if (Encoding_IsValid(iSrcEncoding)) 
SourceEncoding = iSrcEncoding; else - SourceEncoding = CPI_UTF8; + SourceEncoding = CPI_ANSI_DEFAULT; } else if (iSrcEncoding == CPI_NONE) { SourceEncoding = CPI_NONE; @@ -260,9 +260,53 @@ BOOL Encoding_HasChanged(int iOriginalEncoding) { // ============================================================================ // ============================================================================ +/* +* Mostly taken from "tellenc" +* Program to detect the encoding of text. It currently supports ASCII, +* UTF-8, UTF-16/32 (little-endian or big-endian), Latin1, Windows-1252, +* CP437, GB2312, GBK, Big5, and SJIS, among others. +* +* Copyright (C) 2006-2016 Wu Yongwei +* +* This software is provided 'as-is', without any express or implied +* warranty. In no event will the authors be held liable for any +* damages arising from the use of this software. +* +* Permission is granted to anyone to use this software for any purpose, +* including commercial applications, and to alter it and redistribute +* it freely, subject to the following restrictions: +* +* 1. The origin of this software must not be misrepresented; you must +* not claim that you wrote the original software. If you use this +* software in a product, an acknowledgment in the product +* documentation would be appreciated but is not required. +* 2. Altered source versions must be plainly marked as such, and must +* not be misrepresented as being the original software. +* 3. This notice may not be removed or altered from any source +* distribution. 
+* +* +* The latest version of this software should be available at: +* +* +*/ + + typedef unsigned short uint16_t; typedef unsigned int uint32_t; + +typedef enum _UTF8_ValidationState +{ + UTF8_INVALID, + UTF8_1, + UTF8_2, + UTF8_3, + UTF8_4, + UTF8_TAIL +} UTF8_ValidationState; + + typedef struct { int encID; uint16_t dbyte; @@ -284,6 +328,13 @@ static freq_analysis_data_t freq_analysis_data[] = { 19, 0xfa9d, "windows-1250" }, // "úť" (Slovak) { 19, 0x9e69, "windows-1250" }, // "ži" (Slovenian) { 19, 0xe869, "windows-1250" }, // "či" (Slovenian) + +{ 30, 0xe820, "windows-1251" }, // "и " (Cyrillic) +{ 30, 0xe3ee, "windows-1251" }, // "го" (Cyrillic) +{ 30, 0xeaee, "windows-1251" }, // "ко" (Cyrillic) +{ 30, 0xf1ea, "windows-1251" }, // "ск" (Cyrillic) +{ 30, 0xf1f2, "windows-1251" }, // "ст" (Cyrillic) + { 67, 0xe020, "windows-1252" }, // "à " (French) { 67, 0xe920, "windows-1252" }, // "é " (French) { 67, 0xe963, "windows-1252" }, // "éc" (French) @@ -304,43 +355,43 @@ static freq_analysis_data_t freq_analysis_data[] = { 52, 0xc4c4, "cp437" }, // "──" { 52, 0xcdcd, "cp437" }, // "══" { 52, 0xdbdb, "cp437" }, // "██" -{ 72, 0xa1a1, "gbk" }, // " " -{ 72, 0xa1a2, "gbk" }, // "、" -{ 72, 0xa1a3, "gbk" }, // "。" -{ 72, 0xa1a4, "gbk" }, // "·" -{ 72, 0xa1b6, "gbk" }, // "《" -{ 72, 0xa1b7, "gbk" }, // "》" -{ 72, 0xa3ac, "gbk" }, // "," -{ 72, 0xa3ba, "gbk" }, // ":" -{ 72, 0xb5c4, "gbk" }, // "的" -{ 72, 0xc1cb, "gbk" }, // "了" -{ 72, 0xd2bb, "gbk" }, // "一" -{ 72, 0xcac7, "gbk" }, // "是" -{ 72, 0xb2bb, "gbk" }, // "不" -{ 72, 0xb8f6, "gbk" }, // "个" -{ 72, 0xc8cb, "gbk" }, // "人" -{ 72, 0xd5e2, "gbk" }, // "这" -{ 72, 0xd3d0, "gbk" }, // "有" -{ 72, 0xced2, "gbk" }, // "我" -{ 72, 0xc4e3, "gbk" }, // "你" -{ 72, 0xcbfb, "gbk" }, // "他" -{ 72, 0xcbfd, "gbk" }, // "她" -{ 72, 0xc9cf, "gbk" }, // "上" -{ 72, 0xbfb4, "gbk" }, // "看" -{ 72, 0xd6ae, "gbk" }, // "之" -{ 72, 0xbbb9, "gbk" }, // "还" -{ 72, 0xbfc9, "gbk" }, // "可" -{ 72, 0xbaf3, "gbk" }, // "后" -{ 72, 0xd6d0, "gbk" }, 
// "中" -{ 72, 0xd0d0, "gbk" }, // "行" -{ 72, 0xb1d2, "gbk" }, // "币" -{ 72, 0xb3f6, "gbk" }, // "出" -{ 72, 0xb7d1, "gbk" }, // "费" -{ 72, 0xb8d0, "gbk" }, // "感" -{ 72, 0xbef5, "gbk" }, // "觉" -{ 72, 0xc4ea, "gbk" }, // "年" -{ 72, 0xd4c2, "gbk" }, // "月" -{ 72, 0xc8d5, "gbk" }, // "日" +{ 20, 0xa1a1, "gbk" }, // " " +{ 20, 0xa1a2, "gbk" }, // "、" +{ 20, 0xa1a3, "gbk" }, // "。" +{ 20, 0xa1a4, "gbk" }, // "·" +{ 20, 0xa1b6, "gbk" }, // "《" +{ 20, 0xa1b7, "gbk" }, // "》" +{ 20, 0xa3ac, "gbk" }, // "," +{ 20, 0xa3ba, "gbk" }, // ":" +{ 20, 0xb5c4, "gbk" }, // "的" +{ 20, 0xc1cb, "gbk" }, // "了" +{ 20, 0xd2bb, "gbk" }, // "一" +{ 20, 0xcac7, "gbk" }, // "是" +{ 20, 0xb2bb, "gbk" }, // "不" +{ 20, 0xb8f6, "gbk" }, // "个" +{ 20, 0xc8cb, "gbk" }, // "人" +{ 20, 0xd5e2, "gbk" }, // "这" +{ 20, 0xd3d0, "gbk" }, // "有" +{ 20, 0xced2, "gbk" }, // "我" +{ 20, 0xc4e3, "gbk" }, // "你" +{ 20, 0xcbfb, "gbk" }, // "他" +{ 20, 0xcbfd, "gbk" }, // "她" +{ 20, 0xc9cf, "gbk" }, // "上" +{ 20, 0xbfb4, "gbk" }, // "看" +{ 20, 0xd6ae, "gbk" }, // "之" +{ 20, 0xbbb9, "gbk" }, // "还" +{ 20, 0xbfc9, "gbk" }, // "可" +{ 20, 0xbaf3, "gbk" }, // "后" +{ 20, 0xd6d0, "gbk" }, // "中" +{ 20, 0xd0d0, "gbk" }, // "行" +{ 20, 0xb1d2, "gbk" }, // "币" +{ 20, 0xb3f6, "gbk" }, // "出" +{ 20, 0xb7d1, "gbk" }, // "费" +{ 20, 0xb8d0, "gbk" }, // "感" +{ 20, 0xbef5, "gbk" }, // "觉" +{ 20, 0xc4ea, "gbk" }, // "年" +{ 20, 0xd4c2, "gbk" }, // "月" +{ 20, 0xc8d5, "gbk" }, // "日" { 22, 0xa140, "big5" }, // " " { 22, 0xa141, "big5" }, // "," { 22, 0xa143, "big5" }, // "。" @@ -416,19 +467,16 @@ static freq_analysis_data_t freq_analysis_data[] = }; // ============================================================================ -typedef struct _char_count_t { - uint16_t first; - uint32_t second; -} char_count_t; - -//typedef pair char_count_t; -//typedef map char_count_map_t; -//typedef vector char_count_vec_t; +#define MAX_CHAR 256 +typedef struct _dbyte_cnt_t { + uint16_t dblByte; + uint32_t count; +} dbyte_cnt_t; int __fastcall 
check_freq_dbyte(uint16_t dbyte) { - for (size_t i = 0; i < sizeof freq_analysis_data / sizeof(freq_analysis_data_t); ++i) { + for (size_t i = 0; i < (sizeof freq_analysis_data / sizeof(freq_analysis_data_t)); ++i) { if (dbyte == freq_analysis_data[i].dbyte) { return freq_analysis_data[i].encID; } @@ -437,18 +485,20 @@ int __fastcall check_freq_dbyte(uint16_t dbyte) } // ============================================================================ - -int __fastcall search_freq_dbytes(const UT_array* dbyte_char_cnt) +// -------------------------------------------------------------- +// arg dbyte_cnt_map must be sorted (high count first) +// +int __fastcall search_freq_dbytes(const UT_array* dbyte_cnt_map) { - size_t max_comp_idx = 10; - if (max_comp_idx > utarray_len(dbyte_char_cnt)) { - max_comp_idx = utarray_len(dbyte_char_cnt); - } - for (size_t i = 0; i < max_comp_idx; ++i) { + size_t max_comp_cnt = 10; + size_t cnt = 0; - const char_count_t* ccnt = (char_count_t*)utarray_eltptr(dbyte_char_cnt, i); + for (dbyte_cnt_t* p = (dbyte_cnt_t*)utarray_front(dbyte_cnt_map); + (p != NULL) && (++cnt <= max_comp_cnt); + p = (dbyte_cnt_t*)utarray_next(dbyte_cnt_map, p)) { + + const int enc = check_freq_dbyte(p->dblByte); - const int enc = check_freq_dbyte(ccnt->first); if (enc > CPI_NONE) { return enc; } @@ -458,35 +508,300 @@ int __fastcall search_freq_dbytes(const UT_array* dbyte_char_cnt) // ============================================================================ +static UTF8_ValidationState utf8_char_table[MAX_CHAR]; -int Encoding_TellEncoding(const unsigned char* const buffer, const size_t len) +void init_utf8_validation_char_table() +{ + int ch = 0; + utf8_char_table[ch] = UTF8_INVALID; + ++ch; + for (; ch <= 0x7f; ++ch) { + utf8_char_table[ch] = UTF8_1; + } + for (; ch <= 0xbf; ++ch) { + utf8_char_table[ch] = UTF8_TAIL; + } + for (; ch <= 0xc1; ++ch) { + utf8_char_table[ch] = UTF8_INVALID; + } + for (; ch <= 0xdf; ++ch) { + utf8_char_table[ch] = UTF8_2; + } + for 
(; ch <= 0xef; ++ch) { + utf8_char_table[ch] = UTF8_3; + } + for (; ch <= 0xf4; ++ch) { + utf8_char_table[ch] = UTF8_4; + } + for (; ch <= 0xff; ++ch) { + utf8_char_table[ch] = UTF8_INVALID; + } +} +// ============================================================================ + + +void __fastcall init_sbyte_char_count(dbyte_cnt_t sbyte_char_cnt[]) +{ + for (size_t ch = 0; ch < MAX_CHAR; ++ch) { + sbyte_char_cnt[ch].dblByte = (uint16_t)ch; + sbyte_char_cnt[ch].count = 0; + } +} +// ============================================================================ + +static const unsigned char NON_TEXT_CHARS[] = { 0, 26, 127, 255 }; + +__forceinline bool is_non_text(char ch) +{ + for (size_t i = 0; i < sizeof(NON_TEXT_CHARS); ++i) { + if (ch == NON_TEXT_CHARS[i]) { + return true; + } + } + return false; +} +// ============================================================================ + + +__forceinline dbyte_cnt_t* find_dbyte_count(const UT_array* const dbyte_cnt_map, const uint16_t dbyte) +{ + for (dbyte_cnt_t* p = (dbyte_cnt_t*)utarray_front(dbyte_cnt_map); + (p != NULL); + p = (dbyte_cnt_t*)utarray_next(dbyte_cnt_map, p)) { + + if (p->dblByte == dbyte) + return p; + } + return NULL; +} +// ============================================================================ + + +static int ascending_count(const void *lhs, const void *rhs) +{ + const uint32_t lcnt = ((dbyte_cnt_t*)lhs)->count; + const uint32_t rcnt = ((dbyte_cnt_t*)rhs)->count; + return (lcnt - rcnt); // ascending order +} + +static int descending_count(const void *lhs, const void *rhs) +{ + const uint32_t lcnt = ((dbyte_cnt_t*)lhs)->count; + const uint32_t rcnt = ((dbyte_cnt_t*)rhs)->count; + return (rcnt - lcnt); // descending order +} + +// ============================================================================ + +//typedef pair char_count_t; +//typedef map char_count_map_t; +//typedef vector char_count_vec_t; + +static const char NUL = '\0'; +static const char DOS_EOF = '\x1A'; +static const int EVEN 
= 0; +static const int ODD = 1; + +static size_t nul_count_byte[2]; +static size_t nul_count_word[2]; + + +int Encoding_Analyze(const char* const buffer, const size_t len) { int iEncoding = CPI_NONE; - UT_icd char_count_icd = { sizeof(char_count_t), NULL, NULL, NULL }; - UT_array* char_count_vector = NULL; + bool is_binary = false; + bool is_valid_utf8 = true; + bool is_valid_latin1 = true; + uint32_t dbyte_cnt = 0; + uint32_t dbyte_hihi_cnt = 0; - utarray_new(char_count_vector, &char_count_icd); - utarray_reserve(char_count_vector, 256); + UT_icd dbyte_count_icd = { sizeof(dbyte_cnt_t), NULL, NULL, NULL }; + UT_array* dbyte_count_map = NULL; - ///... + utarray_new(dbyte_count_map, &dbyte_count_icd); + utarray_reserve(dbyte_count_map, MAX_CHAR); - utarray_clear(char_count_vector); - utarray_free(char_count_vector); + //~dbyte_cnt_t sbyte_char_count[MAX_CHAR]; + //~init_sbyte_char_count(sbyte_char_count); + + int last_ch = EOF; + UTF8_ValidationState utf8_valid_state = UTF8_1; + + for (size_t pos = 0; pos < len; ++pos) { + + const unsigned char ch = buffer[pos]; + //~ ++(sbyte_char_count[ch].count); + + // Check for binary data (including UTF-16/32) + if (is_non_text(ch)) { + if (!is_binary && !(ch == DOS_EOF && pos == len - 1)) { + is_binary = true; + } + if (ch == NUL) { + // Count for NULs in even- and odd-number bytes + nul_count_byte[pos & 1]++; + if (pos & 1) { + if (buffer[pos - 1] == NUL) { + // Count for NULs in even- and odd-number words + nul_count_word[(pos / 2) & 1]++; + } + } + } + } + + // Check for UTF-8 validity + if (is_valid_utf8) { + switch (utf8_char_table[ch]) { + case UTF8_INVALID: + is_valid_utf8 = false; + break; + case UTF8_1: + if (utf8_valid_state != UTF8_1) { + is_valid_utf8 = false; + } + break; + case UTF8_2: + if (utf8_valid_state != UTF8_1) { + is_valid_utf8 = false; + } + else { + utf8_valid_state = UTF8_2; + } + break; + case UTF8_3: + if (utf8_valid_state != UTF8_1) { + is_valid_utf8 = false; + } + else { + utf8_valid_state = 
UTF8_3; + } + break; + case UTF8_4: + if (utf8_valid_state != UTF8_1) { + is_valid_utf8 = false; + } + else { + utf8_valid_state = UTF8_4; + } + break; + case UTF8_TAIL: + if (utf8_valid_state > UTF8_1) { + utf8_valid_state--; + } + else { + is_valid_utf8 = false; + } + break; + } + } + + // Check whether non-Latin1 characters appear + if (is_valid_latin1) { + if (ch >= 0x80 && ch < 0xa0) { + is_valid_latin1 = false; + } + } + + // Construct double-bytes and count + if (last_ch != EOF) + { + dbyte_cnt_t dbyte_item = { 0, 1 }; + dbyte_item.dblByte = (uint16_t)((last_ch << 8) + ch); + + dbyte_cnt_t* item = find_dbyte_count(dbyte_count_map, dbyte_item.dblByte); + if (item == NULL) + utarray_push_back(dbyte_count_map, &dbyte_item); + else + ++(item->count); + + dbyte_cnt++; + if ((last_ch > 0xa0) && (ch > 0xa0)) { + ++dbyte_hihi_cnt; + } + //last_ch = EOF; + } + + if (ch >= 0x80) + last_ch = ch; + else + last_ch = EOF; + + } // for + + if (!is_valid_utf8 && is_binary) { + // Heuristics for UTF-16/32 + if (nul_count_byte[EVEN] > 4 && + (nul_count_byte[ODD] == 0 || + nul_count_byte[EVEN] / nul_count_byte[ODD] > 20)) { + iEncoding = CPI_UNICODEBE; + } + else if (nul_count_byte[ODD] > 4 && + (nul_count_byte[EVEN] == 0 || + nul_count_byte[ODD] / nul_count_byte[EVEN] > 20)) { + iEncoding = CPI_UNICODE; + } + else if (nul_count_word[EVEN] > 4 && + (nul_count_word[ODD] == 0 || + nul_count_word[EVEN] / nul_count_word[ODD] > 20)) { + iEncoding = CPI_UCS4BE; // utf-32 is not a built-in encoding for Notepad3 + } + else if (nul_count_word[ODD] > 4 && + (nul_count_word[EVEN] == 0 || + nul_count_word[ODD] / nul_count_word[EVEN] > 20)) { + iEncoding = CPI_UCS4; // utf-32le is not a built-in encoding for Notepad3 + } + } + else if (dbyte_cnt == 0) { + // No characters outside the scope of ASCII + iEncoding = CPI_ANSI_DEFAULT; + } + else if (is_valid_utf8) { + // Only valid UTF-8 sequences + iEncoding = CPI_UTF8; + } + + if (iEncoding == CPI_NONE) // still unknown ? 
+ { + // Get the character counts in descending order + //~qsort((void*)sbyte_char_count, MAX_CHAR, sizeof(dbyte_cnt_t), descending_count); + + // Get the double-byte counts in descending order + utarray_sort(dbyte_count_map, descending_count); + + const int probEncoding = search_freq_dbytes(dbyte_count_map); + + if (probEncoding != CPI_NONE) { + iEncoding = probEncoding; + } + else if (((dbyte_hihi_cnt * 100) / ++dbyte_cnt) < 5) { + // mostly a low-byte follows a high-byte + iEncoding = CPI_ANSI_DEFAULT; + } + } + + utarray_clear(dbyte_count_map); + utarray_free(dbyte_count_map); - UNUSED(buffer); - UNUSED(len); return iEncoding; } // ============================================================================ - +// ============================================================================ +// ============================================================================ +// +// END OF "TELLENC" PART +// // ============================================================================ // ============================================================================ -void Encoding_InitDefaults() + +void Encoding_InitDefaults() { + // init tellenc code page detection + init_utf8_validation_char_table(); + const UINT uCodePageMBCS[20] = { 42, // (Symbol) 50220,50221,50222,50225,50227,50229, // (Chinese, Japanese, Korean) @@ -871,85 +1186,86 @@ BOOL Encoding_GetFromComboboxEx(HWND hwnd, int *pidEncoding) { UINT Encoding_GetCodePage(int iEncoding) { - return g_Encodings[iEncoding].uCodePage; + return (iEncoding >= 0) ? g_Encodings[iEncoding].uCodePage : CP_ACP; } // ============================================================================ BOOL Encoding_IsDefault(int iEncoding) { - return (g_Encodings[iEncoding].uFlags & NCP_DEFAULT); + return (iEncoding >= 0) ? 
(g_Encodings[iEncoding].uFlags & NCP_DEFAULT) : FALSE; } // ============================================================================ BOOL Encoding_IsANSI(int iEncoding) { - return (g_Encodings[iEncoding].uFlags & NCP_ANSI); + return (iEncoding >= 0) ? (g_Encodings[iEncoding].uFlags & NCP_ANSI) : FALSE; } // ============================================================================ BOOL Encoding_IsOEM(int iEncoding) { - return (g_Encodings[iEncoding].uFlags & NCP_OEM); + return (iEncoding >= 0) ? (g_Encodings[iEncoding].uFlags & NCP_OEM) : FALSE; } // ============================================================================ BOOL Encoding_IsUTF8(int iEncoding) { - return (g_Encodings[iEncoding].uFlags & NCP_UTF8); + return (iEncoding >= 0) ? (g_Encodings[iEncoding].uFlags & NCP_UTF8) : FALSE; } // ============================================================================ BOOL Encoding_IsUTF8_SIGN(int iEncoding) { - return (g_Encodings[iEncoding].uFlags & NCP_UTF8_SIGN); + return (iEncoding >= 0) ? (g_Encodings[iEncoding].uFlags & NCP_UTF8_SIGN) : FALSE; } // ============================================================================ BOOL Encoding_IsMBCS(int iEncoding) { - return (g_Encodings[iEncoding].uFlags & NCP_MBCS); + return (iEncoding >= 0) ? (g_Encodings[iEncoding].uFlags & NCP_MBCS) : FALSE; } // ============================================================================ BOOL Encoding_IsUNICODE(int iEncoding) { - return (g_Encodings[iEncoding].uFlags & NCP_UNICODE); + return (iEncoding >= 0) ? (g_Encodings[iEncoding].uFlags & NCP_UNICODE) : FALSE; } // ============================================================================ BOOL Encoding_IsUNICODE_BOM(int iEncoding) { - return (g_Encodings[iEncoding].uFlags & NCP_UNICODE_BOM); + return (iEncoding >= 0) ? 
(g_Encodings[iEncoding].uFlags & NCP_UNICODE_BOM) : FALSE; } // ============================================================================ BOOL Encoding_IsUNICODE_REVERSE(int iEncoding) { - return (g_Encodings[iEncoding].uFlags & NCP_UNICODE_REVERSE); + return (iEncoding >= 0) ? (g_Encodings[iEncoding].uFlags & NCP_UNICODE_REVERSE) : FALSE; } // ============================================================================ BOOL Encoding_IsINTERNAL(int iEncoding) { - return (g_Encodings[iEncoding].uFlags & NCP_INTERNAL); + return (iEncoding >= 0) ? (g_Encodings[iEncoding].uFlags & NCP_INTERNAL) : FALSE; } // ============================================================================ BOOL Encoding_IsEXTERNAL_8BIT(int iEncoding) { - return (g_Encodings[iEncoding].uFlags & NCP_EXTERNAL_8BIT); + return (iEncoding >= 0) ? (g_Encodings[iEncoding].uFlags & NCP_EXTERNAL_8BIT) : FALSE; } // ============================================================================ BOOL Encoding_IsRECODE(int iEncoding) { - return (g_Encodings[iEncoding].uFlags & NCP_RECODE); + return (iEncoding >= 0) ? (g_Encodings[iEncoding].uFlags & NCP_RECODE) : FALSE; } // ============================================================================ void Encoding_SetDefaultFlag(int iEncoding) { - g_Encodings[iEncoding].uFlags |= NCP_DEFAULT; + if (iEncoding >= 0) + g_Encodings[iEncoding].uFlags |= NCP_DEFAULT; } // ============================================================================ const WCHAR* Encoding_GetLabel(int iEncoding) { - return g_Encodings[iEncoding].wchLabel; + return (iEncoding >= 0) ? g_Encodings[iEncoding].wchLabel : NULL; } // ============================================================================ const char* Encoding_GetParseNames(int iEncoding) { - return g_Encodings[iEncoding].pszParseNames; + return (iEncoding >= 0) ? 
g_Encodings[iEncoding].pszParseNames : NULL; } // ============================================================================ @@ -1251,324 +1567,3 @@ INT UTF8_mbslen(LPCSTR source, INT byte_length) return wchar_length; } // ============================================================================ - - - - -/* -* Copyright (C) 2006-2016 Wu Yongwei -* -* This software is provided 'as-is', without any express or implied -* warranty. In no event will the authors be held liable for any -* damages arising from the use of this software. -* -* Permission is granted to anyone to use this software for any purpose, -* including commercial applications, and to alter it and redistribute -* it freely, subject to the following restrictions: -* -* 1. The origin of this software must not be misrepresented; you must -* not claim that you wrote the original software. If you use this -* software in a product, an acknowledgement in the product -* documentation would be appreciated but is not required. -* 2. Altered source versions must be plainly marked as such, and must -* not be misrepresented as being the original software. -* 3. This notice may not be removed or altered from any source -* distribution. -* -* -* The latest version of this software should be available at: -* -* -*/ - -/** -* @file TellEnc.c -* -* Program to detect the encoding of text. It currently supports ASCII, -* UTF-8, UTF-16/32 (little-endian or big-endian), Latin1, Windows-1252, -* CP437, GB2312, GBK, Big5, and SJIS, among others. 
-* -* @version 1.22, 2016/07/26 -* @author Wu Yongwei -*/ - -#define MAX_CHAR 256 - - - - - - -static const unsigned char NON_TEXT_CHARS[] = { 0, 26, 127, 255 }; -static const char NUL = '\0'; -static const char DOS_EOF = '\x1A'; -static const int EVEN = 0; -static const int ODD = 1; - - -// ============================================================================ - - -static size_t nul_count_byte[2]; -static size_t nul_count_word[2]; - -static bool is_binary = false; -static bool is_valid_utf8 = true; -static bool is_valid_latin1 = true; -static uint32_t dbyte_cnt = 0; -static uint32_t dbyte_hihi_cnt = 0; - - -// ============================================================================ -// ============================================================================ - - -static inline bool is_non_text(char ch) -{ - for (size_t i = 0; i < sizeof(NON_TEXT_CHARS); ++i) { - if (ch == NON_TEXT_CHARS[i]) { - return true; - } - } - return false; -} -// ============================================================================ - - - - -static void init_sbyte_char_count(char_count_t sbyte_char_cnt[]) -{ - for (size_t i = 0; i < MAX_CHAR; ++i) { - sbyte_char_cnt[i].first = (uint16_t)i; - sbyte_char_cnt[i].second = 0; - } -} -// ============================================================================ - - - - - - -#if FALSE - - -typedef struct _pattern_t { - const char* name; - const char* pattern; - size_t pattern_len; -} pattern_t; - -static const char* check_ucs_bom(const unsigned char* const buffer, const size_t len) -{ - const pattern_t patterns[] = { - { "ucs-4", "\x00\x00\xFE\xFF", 4 }, - { "ucs-4le", "\xFF\xFE\x00\x00", 4 }, - { "utf-8", "\xEF\xBB\xBF", 3 }, - { "utf-16", "\xFE\xFF", 2 }, - { "utf-16le", "\xFF\xFE", 2 }, - { NULL, NULL, 0 } - }; - for (size_t i = 0; patterns[i].name; ++i) { - const pattern_t* item = &(patterns[i]); - if (len >= item->pattern_len && memcmp(buffer, item->pattern, item->pattern_len) == 0) { - return item->name; - } - } - 
return NULL; -} -// ============================================================================ - - - - - - -const char* tellenc(const unsigned char* const buffer, const size_t len) -{ - if (len == 0) { - return "unknown"; - } - - const char* result = check_ucs_bom(buffer, len); - if (result) { - return result; - } - - char_count_t sbyte_char_cnt[MAX_CHAR]; - char_count_map_t dbyte_char_cnt_map; - init_sbyte_char_count(sbyte_char_cnt); - - unsigned char ch; - int last_ch = EOF; - int utf8_state = UTF8_1; - for (size_t i = 0; i < len; ++i) { - ch = buffer[i]; - sbyte_char_cnt[ch].second++; - - // Check for binary data (including UTF-16/32) - if (is_non_text(ch)) { - if (!is_binary && !(ch == DOS_EOF && i == len - 1)) { - is_binary = true; - } - if (ch == NUL) { - // Count for NULs in even- and odd-number bytes - nul_count_byte[i & 1]++; - if (i & 1) { - if (buffer[i - 1] == NUL) { - // Count for NULs in even- and odd-number words - nul_count_word[(i / 2) & 1]++; - } - } - } - } - - // Check for UTF-8 validity - if (is_valid_utf8) { - switch (utf8_char_table[ch]) { - case UTF8_INVALID: - is_valid_utf8 = false; - break; - case UTF8_1: - if (utf8_state != UTF8_1) { - is_valid_utf8 = false; - } - break; - case UTF8_2: - if (utf8_state != UTF8_1) { - is_valid_utf8 = false; - } else { - utf8_state = UTF8_2; - } - break; - case UTF8_3: - if (utf8_state != UTF8_1) { - is_valid_utf8 = false; - } else { - utf8_state = UTF8_3; - } - break; - case UTF8_4: - if (utf8_state != UTF8_1) { - is_valid_utf8 = false; - } else { - utf8_state = UTF8_4; - } - break; - case UTF8_TAIL: - if (utf8_state > UTF8_1) { - utf8_state--; - } else { - is_valid_utf8 = false; - } - break; - } - } - - // Check whether non-Latin1 characters appear - if (is_valid_latin1) { - if (ch >= 0x80 && ch < 0xa0) { - is_valid_latin1 = false; - } - } - - // Construct double-bytes and count - if (last_ch != EOF) { - uint16_t dbyte_char = (last_ch << 8) + ch; - dbyte_char_cnt_map[dbyte_char]++; - dbyte_cnt++; - if 
(last_ch > 0xa0 && ch > 0xa0) { - dbyte_hihi_cnt++; - } - last_ch = EOF; - } else if (ch >= 0x80) { - last_ch = ch; - } - } - - // Get the character counts in descending order - sort(sbyte_char_cnt, sbyte_char_cnt + MAX_CHAR, greater_char_count()); - - // Get the double-byte counts in descending order - char_count_vec_t dbyte_char_cnt; - for (char_count_map_t::iterator it = dbyte_char_cnt_map.begin(); - it != dbyte_char_cnt_map.end(); ++it) { - dbyte_char_cnt.push_back(*it); - } - sort(dbyte_char_cnt.begin(), - dbyte_char_cnt.end(), - greater_char_count()); - - if (!is_valid_utf8 && is_binary) { - // Heuristics for UTF-16/32 - if (nul_count_byte[EVEN] > 4 && - (nul_count_byte[ODD] == 0 || - nul_count_byte[EVEN] / nul_count_byte[ODD] > 20)) { - return "utf-16"; - } else if (nul_count_byte[ODD] > 4 && - (nul_count_byte[EVEN] == 0 || - nul_count_byte[ODD] / nul_count_byte[EVEN] > 20)) { - return "utf-16le"; - } else if (nul_count_word[EVEN] > 4 && - (nul_count_word[ODD] == 0 || - nul_count_word[EVEN] / nul_count_word[ODD] > 20)) { - return "ucs-4"; // utf-32 is not a built-in encoding for Vim - } else if (nul_count_word[ODD] > 4 && - (nul_count_word[EVEN] == 0 || - nul_count_word[ODD] / nul_count_word[EVEN] > 20)) { - return "ucs-4le"; // utf-32le is not a built-in encoding for Vim - } else { - return "binary"; - } - } else if (dbyte_cnt == 0) { - // No characters outside the scope of ASCII - return "ascii"; - } else if (is_valid_utf8) { - // Only valid UTF-8 sequences - return "utf-8"; - } else if (const char* enc = search_freq_dbytes(dbyte_char_cnt)) { - return enc; - } else if (dbyte_hihi_cnt * 100 / dbyte_cnt < 5) { - // Mostly a low-byte follows a high-byte - return "windows-1252"; - } - return NULL; -} -// ============================================================================ - - -#endif -const char* tellenc(const unsigned char* const buffer, const size_t len) { UNUSED(buffer); UNUSED(len); return NULL; } - - -const char* tellenc_simplify(const char* const 
buffer, const size_t len) -{ - const char* enc = tellenc((const unsigned char*)buffer, len); - if (enc) { - if (strcmp(enc, "windows-1252") == 0 && is_valid_latin1) { - // Latin1 is subset of Windows-1252 - return "latin1"; - } else if (strcmp(enc, "gbk") == 0 && dbyte_hihi_cnt == dbyte_cnt) { - // Special case for GB2312: no high-byte followed by a low-byte - return "gb2312"; - } - } - return enc; -} -// ============================================================================ - - - -static bool bInitDone = false; - -int GetBufferEncoding(const char* const buffer, const size_t len) -{ - const char* enc = tellenc_simplify(buffer, len); - - if (enc) - return 1; - - return 0; // unknown -} -// ============================================================================ diff --git a/src/Encoding.h b/src/Encoding.h index cd92f7be0..cf69d0f2a 100644 --- a/src/Encoding.h +++ b/src/Encoding.h @@ -65,6 +65,13 @@ extern int g_DOSEncoding; #define CPI_UTF8SIGN 7 #define CPI_UTF7 8 +#define CPI_UTF32 CPI_NONE // invalid +#define CPI_UTF32BE CPI_NONE // invalid +#define CPI_UCS4 CPI_UTF32 // invalid +#define CPI_UCS4BE CPI_UTF32BE // invalid + +#define Encoding_IsNONE(enc) ((enc) == CPI_NONE) + #define IDS_ENCODINGNAME0 61000 #define IDS_EOLMODENAME0 62000 @@ -130,9 +137,7 @@ BOOL IsUTF7(const char*, int); INT UTF8_mbslen_bytes(LPCSTR utf8_string); INT UTF8_mbslen(LPCSTR source, INT byte_length); - -int Encoding_GetEncoding(const char* const, const size_t); - +int Encoding_Analyze(const char* const, const size_t); // -------------------------------------------------------------------------------------------------------------------------------- diff --git a/src/Helpers.c b/src/Helpers.c index 8bd7796cc..ee132a3b7 100644 --- a/src/Helpers.c +++ b/src/Helpers.c @@ -27,10 +27,8 @@ #define VC_EXTRALEAN 1 #include +//#include #include -#include -#include -#include //#include #include "scintilla.h" #include "resource.h" diff --git a/src/Helpers.h b/src/Helpers.h index 
d991f2c60..65c7cdb7e 100644 --- a/src/Helpers.h +++ b/src/Helpers.h @@ -16,11 +16,11 @@ #ifndef _NP3_HELPERS_H_ #define _NP3_HELPERS_H_ -#include #define STRSAFE_NO_CB_FUNCTIONS -#undef STRSAFE_NO_DEPRECATE // don't allow deprecated functions +#define STRSAFE_NO_DEPRECATE // don't allow deprecated functions #include #include +#include #include "typedefs.h" @@ -39,7 +39,6 @@ extern WCHAR g_wchIniFile[MAX_PATH]; #define COUNTOF(ar) ARRAYSIZE(ar) //#define COUNTOF(ar) (sizeof(ar)/sizeof(ar[0])) #define CSTRLEN(s) (COUNTOF(s)-1) - __forceinline void swapi(int* a, int* b) { int t = *a; *a = *b; *b = t; } __forceinline void swapos(DocPos* a, DocPos* b) { DocPos t = *a; *a = *b; *b = t; } @@ -53,8 +52,7 @@ __forceinline void swapos(DocPos* a, DocPos* b) { DocPos t = *a; *a = *b; *b = WritePrivateProfileString(lpSection,lpName,(lpString),g_wchIniFile) #define IniDeleteSection(lpSection) \ WritePrivateProfileSection(lpSection,NULL,g_wchIniFile) -__inline BOOL IniSetInt(LPCWSTR lpSection, LPCWSTR lpName, int i) -{ +__inline BOOL IniSetInt(LPCWSTR lpSection, LPCWSTR lpName, int i) { WCHAR tch[32] = { L'\0' }; StringCchPrintf(tch, COUNTOF(tch), L"%i", i); return IniSetString(lpSection, lpName, tch); } #define IniSetBool(lpSection,lpName,nValue) \ @@ -73,8 +71,7 @@ BOOL IniSectionSetString(LPWSTR,LPCWSTR,LPCWSTR); __inline BOOL IniSectionSetInt(LPWSTR lpCachedIniSection,LPCWSTR lpName,int i) { WCHAR tch[32]={L'\0'}; StringCchPrintf(tch,COUNTOF(tch),L"%i",i); return IniSectionSetString(lpCachedIniSection,lpName,tch); } -__inline BOOL IniSectionSetBool(LPWSTR lpCachedIniSection, LPCWSTR lpName, BOOL b) -{ +__inline BOOL IniSectionSetBool(LPWSTR lpCachedIniSection, LPCWSTR lpName, BOOL b) { return IniSectionSetInt(lpCachedIniSection, lpName, (b ? 
1 : 0)); } diff --git a/src/Notepad3.rc b/src/Notepad3.rc index ad5c7cc41..d68e3aca2 100644 --- a/src/Notepad3.rc +++ b/src/Notepad3.rc @@ -751,16 +751,16 @@ FONT 8, "MS Shell Dlg", 400, 0, 0x1 BEGIN LTEXT "&Default encoding (new file):",IDC_STATIC,7,7,90,8 CONTROL "",IDC_ENCODINGLIST,"ComboBoxEx32",CBS_DROPDOWNLIST | WS_CLIPSIBLINGS | WS_VSCROLL | WS_TABSTOP,7,20,167,128 - CONTROL "Fallback on detection failure.", IDC_USEASREADINGFALLBACK, - "Button", BS_AUTOCHECKBOX | WS_TABSTOP, 7, 40, 108, 10 - CONTROL "Skip automatic &Unicode detection.",IDC_NOUNICODEDETECTION, - "Button",BS_AUTOCHECKBOX | WS_TABSTOP,7,51,124,10 + CONTROL "Use as &fallback on detection failure.",IDC_USEASREADINGFALLBACK, + "Button",BS_AUTOCHECKBOX | WS_TABSTOP,7,40,155,10 + CONTROL "Skip &encoding detection.",IDC_NOUNICODEDETECTION, + "Button",BS_AUTOCHECKBOX | WS_TABSTOP,7,51,122,10 CONTROL "Open 7-bit &ASCII files in UTF-8 mode.",IDC_ASCIIASUTF8, "Button",BS_AUTOCHECKBOX | WS_TABSTOP,7,62,136,10 CONTROL "Open 8-bit *.&nfo/diz files in DOS-437 mode.",IDC_NFOASOEM, - "Button",BS_AUTOCHECKBOX | WS_TABSTOP,7,73,167,10 - CONTROL "Don't parse encoding &tags.",IDC_ENCODINGFROMFILEVARS, - "Button",BS_AUTOCHECKBOX | WS_TABSTOP,7,84,102,10 + "Button",BS_AUTOCHECKBOX | WS_TABSTOP,7,73,155,10 + CONTROL "Don't parse encoding file &tags.",IDC_ENCODINGFROMFILEVARS, + "Button",BS_AUTOCHECKBOX | WS_TABSTOP,7,84,126,10 DEFPUSHBUTTON "OK",IDOK,68,101,50,14 PUSHBUTTON "Cancel",IDCANCEL,124,101,50,14 END