Notepad3/grepWinNP3/sktoolslib_mod/UnicodeUtils.cpp

// sktoolslib - common files for SK tools

// Copyright (C) 2012-2013, 2017, 2020 - Stefan Kueng

// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software Foundation,
// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
//

#include "stdafx.h"
#include "UnicodeUtils.h"
#include <memory>

// Nothing to set up: the constructor body is intentionally empty.
CUnicodeUtils::CUnicodeUtils() = default;

// Nothing to release: the destructor body is intentionally empty.
CUnicodeUtils::~CUnicodeUtils() = default;

#ifdef UNICODE
// Converts a wide string to UTF-8 via WideCharToMultiByte(CP_UTF8).
// An empty input yields an empty string.
// stopAtNull == true : the result ends at the first NUL of the converted text.
// stopAtNull == false: all converted bytes are returned, embedded NULs included.
std::string CUnicodeUtils::StdGetUTF8(const std::wstring& wide, bool stopAtNull /* = true*/)
{
    const int charCount = (int)wide.size();
    if (charCount == 0)
        return std::string();

    // four bytes per input character, one of them reserved for the terminator
    const int capacity = charCount * 4;
    auto      buffer   = std::make_unique<char[]>(capacity);
    const int written  = WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), charCount, buffer.get(), capacity - 1, nullptr, nullptr);
    buffer[written]    = 0;

    return stopAtNull ? std::string(buffer.get()) : std::string(buffer.get(), written);
}

// Converts a wide string to the ANSI code page via WideCharToMultiByte(CP_ACP).
// An empty input yields an empty string.
// stopAtNull == true : the result ends at the first NUL of the converted text.
// stopAtNull == false: all converted bytes are returned, embedded NULs included.
std::string CUnicodeUtils::StdGetANSI(const std::wstring& wide, bool stopAtNull /* = true*/)
{
    const int charCount = (int)wide.size();
    if (charCount == 0)
        return std::string();

    // four bytes per input character, one of them reserved for the terminator
    const int capacity = charCount * 4;
    auto      buffer   = std::make_unique<char[]>(capacity);
    const int written  = WideCharToMultiByte(CP_ACP, 0, wide.c_str(), charCount, buffer.get(), capacity - 1, nullptr, nullptr);
    buffer[written]    = 0;

    return stopAtNull ? std::string(buffer.get()) : std::string(buffer.get(), written);
}

// Converts a UTF-8 encoded string to a wide string via MultiByteToWideChar(CP_UTF8).
// An empty input yields an empty string.
// stopAtNull == true : the result ends at the first NUL of the converted text.
// stopAtNull == false: all converted characters are returned, embedded NULs included.
std::wstring CUnicodeUtils::StdGetUnicode(const std::string& multibyte, bool stopAtNull)
{
    const int byteCount = (int)multibyte.size();
    if (byteCount == 0)
        return std::wstring();

    // one slot of the buffer is reserved for the terminator
    const int capacity = byteCount * 4;
    auto      buffer   = std::make_unique<wchar_t[]>(capacity);
    const int written  = MultiByteToWideChar(CP_UTF8, 0, multibyte.c_str(), byteCount, buffer.get(), capacity - 1);
    buffer[written]    = 0;

    return stopAtNull ? std::wstring(buffer.get()) : std::wstring(buffer.get(), written);
}
#endif

// Converts a wide string to the ANSI code page (CP_ACP). Characters that have
// no representation in that code page are replaced by ".".
// An empty input, or a failed conversion, yields an empty string.
// stopAtNull == true : the result ends at the first NUL of the converted text.
// stopAtNull == false: all converted bytes are returned, embedded NULs included.
std::string WideToMultibyte(const std::wstring& wide, bool stopAtNull /* = true*/)
{
    if (wide.empty())
        return std::string();

    // Three bytes per wide character is the worst case (reached when the ANSI
    // code page is UTF-8). The full capacity must be offered to the API:
    // offering less makes the call fail with ERROR_INSUFFICIENT_BUFFER for
    // text that consists of three-byte characters only.
    const size_t capacity        = wide.length() * 3;
    auto         narrow          = std::make_unique<char[]>(capacity + 1); // +1 for the terminator
    BOOL         defaultCharUsed = FALSE;
    const int    ret             = WideCharToMultiByte(CP_ACP, 0, wide.c_str(), static_cast<int>(wide.size()), narrow.get(), static_cast<int>(capacity), ".", &defaultCharUsed);
    narrow[ret]                  = 0; // ret is 0 when the conversion failed
    if (stopAtNull)
        return narrow.get();
    return std::string(narrow.get(), ret);
}

// Converts a wide string to UTF-8.
// An empty input, or a failed conversion, yields an empty string.
// stopAtNull == true : the result ends at the first NUL of the converted text.
// stopAtNull == false: all converted bytes are returned, embedded NULs included.
std::string WideToUTF8(const std::wstring& wide, bool stopAtNull /* = true*/)
{
    if (wide.empty())
        return std::string();

    // A UTF-16 code unit never needs more than three UTF-8 bytes (a surrogate
    // pair takes four bytes for two units). The full capacity must be offered
    // to the API: offering less makes the call fail with
    // ERROR_INSUFFICIENT_BUFFER for text that consists of three-byte
    // characters only (e.g. pure CJK text), which produced an empty result.
    const size_t capacity = wide.length() * 3;
    auto         narrow   = std::make_unique<char[]>(capacity + 1); // +1 for the terminator
    const int    ret      = WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), static_cast<int>(wide.size()), narrow.get(), static_cast<int>(capacity), nullptr, nullptr);
    narrow[ret]           = 0; // ret is 0 when the conversion failed
    if (stopAtNull)
        return narrow.get();
    return std::string(narrow.get(), ret);
}

// Converts a string in the ANSI code page (CP_ACP) to a wide string.
// An empty input yields an empty string.
// stopAtNull == true : the result ends at the first NUL of the converted text.
// stopAtNull == false: all converted characters are returned, embedded NULs included.
std::wstring MultibyteToWide(const std::string& multibyte, bool stopAtNull /* = true*/)
{
    const size_t byteCount = multibyte.length();
    if (byteCount == 0)
        return std::wstring();

    auto buffer = std::make_unique<wchar_t[]>(byteCount * 2 + 2);
    if (!buffer)
        return std::wstring();

    const int written = (int)MultiByteToWideChar(CP_ACP, 0, multibyte.c_str(), (int)byteCount, buffer.get(), (int)byteCount * 2 - 1);
    buffer[written]   = 0;

    return stopAtNull ? std::wstring(buffer.get()) : std::wstring(buffer.get(), written);
}

// Converts a UTF-8 encoded string to a wide string.
// An empty input yields an empty string.
// stopAtNull == true : the result ends at the first NUL of the converted text.
// stopAtNull == false: all converted characters are returned, embedded NULs included.
std::wstring UTF8ToWide(const std::string& multibyte, bool stopAtNull /* = true*/)
{
    const size_t byteCount = multibyte.length();
    if (byteCount == 0)
        return std::wstring();

    auto buffer = std::make_unique<wchar_t[]>(byteCount * 2 + 2);
    if (!buffer)
        return std::wstring();

    const int written = (int)MultiByteToWideChar(CP_UTF8, 0, multibyte.c_str(), (int)byteCount, buffer.get(), (int)byteCount * 2 - 1);
    buffer[written]   = 0;

    return stopAtNull ? std::wstring(buffer.get()) : std::wstring(buffer.get(), written);
}
#ifdef UNICODE
// UNICODE build: converts UTF-8 to a wide string by forwarding to UTF8ToWide().
std::wstring UTF8ToString(const std::string& string, bool stopAtNull /* = true*/)
{
    std::wstring converted = UTF8ToWide(string, stopAtNull);
    return converted;
}
// UNICODE build: converts a wide string to UTF-8 by forwarding to WideToUTF8().
std::string StringToUTF8(const std::wstring& string, bool stopAtNull /* = true*/)
{
    std::string converted = WideToUTF8(string, stopAtNull);
    return converted;
}
#else
// Non-UNICODE build: converts UTF-8 text to the ANSI code page by going
// through a wide string (UTF-8 -> wide -> ANSI).
// stopAtNull is applied to both conversion steps; previously the second step
// ran with its default, so the result was cut at the first NUL even when the
// caller passed stopAtNull == false.
std::string UTF8ToString(const std::string& string, bool stopAtNull /* = true*/)
{
    return WideToMultibyte(UTF8ToWide(string, stopAtNull), stopAtNull);
}
// Non-UNICODE build: converts ANSI code page text to UTF-8 by going through a
// wide string (ANSI -> wide -> UTF-8).
// stopAtNull is applied to both conversion steps; previously the second step
// ran with its default, so the result was cut at the first NUL even when the
// caller passed stopAtNull == false.
std::string StringToUTF8(const std::string& string, bool stopAtNull /* = true*/)
{
    return WideToUTF8(MultibyteToWide(string, stopAtNull), stopAtNull);
}
#endif

// Warning 4200 is switched off only around this struct, presumably because of
// the array member declared without a size.
#pragma warning(push)
#pragma warning(disable : 4200)
// One entry of a string resource as it is walked by LoadStringEx():
// a character count immediately followed by that many characters.
// Entries follow each other back to back, so the next entry starts
// sizeof(STRINGRESOURCEIMAGE) + nLength * sizeof(WCHAR) bytes further on.
struct STRINGRESOURCEIMAGE
{
    WORD  nLength;     // number of WCHARs in achString; 0 is treated as "no string"
    WCHAR achString[]; // the characters; their count is given by nLength
};
#pragma warning(pop) // C4200

// Loads the string resource uID in the language wLanguage into lpBuffer.
//
// hInstance  : module that contains the string resources
// uID        : id of the string to load
// lpBuffer   : receives the string, always NUL terminated on return
// nBufferMax : capacity of lpBuffer in characters, terminator included
// wLanguage  : language id handed to FindResourceEx(); if no resource exists
//              for that language, the lookup is repeated with FindResource()
//
// Returns the number of characters stored in lpBuffer, not counting the
// terminator. Returns 0 (with lpBuffer set to an empty string whenever
// lpBuffer is usable) if the arguments are invalid, the string does not
// exist, the string is empty or the resource data is malformed.
// A string longer than the buffer is truncated to nBufferMax - 1 characters.
int LoadStringEx(HINSTANCE hInstance, UINT uID, LPWSTR lpBuffer, int nBufferMax, WORD wLanguage)
{
    if ((lpBuffer == nullptr) || (nBufferMax <= 0))
        return 0;
    lpBuffer[0] = 0;

    // The resource to load is named (uID / 16) + 1 and the wanted string is
    // entry number (uID % 16) inside of it.
    HRSRC hResource = FindResourceEx(hInstance, RT_STRING, MAKEINTRESOURCE(((uID >> 4) + 1)), wLanguage);
    if (!hResource)
    {
        //try the default language before giving up!
        hResource = FindResource(hInstance, MAKEINTRESOURCE(((uID >> 4) + 1)), RT_STRING);
        if (!hResource)
            return 0;
    }
    const HGLOBAL hGlobal = LoadResource(hInstance, hResource);
    if (!hGlobal)
        return 0;
    const BYTE* pCursor = static_cast<const BYTE*>(::LockResource(hGlobal));
    if (!pCursor)
        return 0;
    const BYTE* const pEnd = pCursor + ::SizeofResource(hInstance, hResource);

    // Walk the length-prefixed entries up to the wanted one. Every entry is
    // verified to lie completely inside the resource before it is used, so
    // truncated or corrupt data cannot cause reads past the end.
    const STRINGRESOURCEIMAGE* pImage = nullptr;
    for (UINT iIndex = uID & 0x000f;; --iIndex)
    {
        const size_t bytesLeft = static_cast<size_t>(pEnd - pCursor);
        if (bytesLeft < sizeof(STRINGRESOURCEIMAGE))
            return 0;
        pImage                 = reinterpret_cast<const STRINGRESOURCEIMAGE*>(pCursor);
        const size_t entrySize = sizeof(STRINGRESOURCEIMAGE) + (pImage->nLength * sizeof(WCHAR));
        if (bytesLeft < entrySize)
            return 0;
        if (iIndex == 0)
            break;
        pCursor += entrySize;
    }
    if (pImage->nLength == 0)
        return 0;

    int ret;
#ifdef UNICODE
    // Leave room for the terminator: copying nBufferMax characters would make
    // wcsncpy_s() fail and the terminator land one element behind the buffer.
    ret = pImage->nLength;
    if (ret > nBufferMax - 1)
        ret = nBufferMax - 1;
    wcsncpy_s(lpBuffer, nBufferMax, pImage->achString, ret);
    lpBuffer[ret] = 0;
#else
    // A capacity of 0 would switch WideCharToMultiByte() into "query required
    // size" mode, so a buffer with room for the terminator only is rejected.
    if (nBufferMax < 2)
        return 0;
    // In this configuration narrow characters are written into lpBuffer, so
    // the terminator has to be placed at a byte offset, not a WCHAR offset.
    BOOL        defaultCharUsed = FALSE;
    const LPSTR pNarrow         = reinterpret_cast<LPSTR>(lpBuffer);
    ret                         = WideCharToMultiByte(CP_ACP, 0, pImage->achString, pImage->nLength, pNarrow, nBufferMax - 1, ".", &defaultCharUsed);
    pNarrow[ret]                = 0;
#endif
    return ret;
}

// Guesses the text encoding of a raw buffer.
//
// pBuffer      : start of the data to inspect
// cb           : number of bytes available at pBuffer
// hasBOM       : set to true if the encoding was recognized by a byte order mark
// inconclusive : set to true if the buffer was too short or contained nothing
//                that separates ANSI from UTF-8; CP_ACP is then only a fallback
//
// Returns 12000 / 12001 (UTF-32 LE / BE), 1200 / 1201 (UTF-16 LE / BE),
// CP_UTF8, CP_ACP, or -1 if the data is assumed to be binary.
int GetCodepageFromBuf(LPVOID pBuffer, int cb, bool& hasBOM, bool& inconclusive)
{
    hasBOM       = false;
    inconclusive = false;
    if (cb < 2)
    {
        inconclusive = true;
        return CP_ACP;
    }

    const UINT8* const  bytes  = static_cast<const UINT8*>(pBuffer);
    const UINT16* const words  = static_cast<const UINT16*>(pBuffer);
    const UINT32* const dwords = static_cast<const UINT32*>(pBuffer);

    // UTF-32 byte order marks
    if (cb >= 4)
    {
        switch (dwords[0])
        {
            case 0x0000FEFF:
                hasBOM = true;
                return 12000; // UTF32_LE
            case 0xFFFE0000:
                hasBOM = true;
                return 12001; // UTF32_BE
            default:
                break;
        }
    }

    // a 0x00000000 dword anywhere in the buffer is taken as the sign of a binary file
    const UINT32* const dwordsEnd = dwords + (cb / 4);
    for (const UINT32* pDword = dwords; pDword != dwordsEnd; ++pDword)
    {
        if (*pDword == 0x00000000)
            return -1;
    }

    // UTF-16 byte order marks
    const UINT16 firstWord = words[0];
    if ((firstWord == 0xFEFF) || (firstWord == 0xFFFE))
    {
        hasBOM = true;
        return (firstWord == 0xFEFF) ? 1200  // UTF16_LE
                                     : 1201; // UTF16_BE
    }

    if (cb < 3)
    {
        inconclusive = true;
        return CP_ACP;
    }

    // UTF-8 byte order mark (EF BB BF)
    if ((firstWord == 0xBBEF) && (bytes[2] == 0xBF))
    {
        hasBOM = true;
        return CP_UTF8;
    }

    // No BOM: validate the data as UTF-8 and count NUL bytes on the way.
    bool sawNonAscii  = false; // at least one byte with the high bit set was seen
    int  pendingBytes = 0;     // continuation bytes still expected for the current sequence
    int  nulCount     = 0;
    for (int pos = 0; pos < cb; ++pos)
    {
        const UINT8 current = bytes[pos];

        if (current == 0)
        {
            // A few stray NUL bytes do not turn an ASCII/UTF-8 file into
            // UTF-16. Only once they exceed one fiftieth of the buffer size
            // (an arbitrary limit) is the data considered UTF-16; the position
            // of the NUL that crossed the limit decides the byte order.
            ++nulCount;
            if (nulCount > (cb / 50))
            {
                if (pos % 2)
                    return 1200; // UTF16_LE
                return 1201;     // UTF16_BE
            }
            pendingBytes = 0;
            continue;
        }

        if (current < 0x80)
        {
            // plain ASCII is not allowed in the middle of a multi-byte sequence
            if (pendingBytes != 0)
                return CP_ACP;
            continue;
        }

        sawNonAscii = true;

        if (current < 0xC0)
        {
            // 10xxxxxx: continuation byte, only valid if one is expected
            if (pendingBytes == 0)
                return CP_ACP;
            --pendingBytes;
            continue;
        }

        // everything from here on is a lead byte and must not interrupt a sequence
        if (pendingBytes != 0)
            return CP_ACP;

        if (current < 0xE0)
        {
            // 110xxxxx: two-byte sequence, 0xC0 and 0xC1 are never valid
            if (current <= 0xC1)
                return CP_ACP;
            pendingBytes = 1;
        }
        else if (current < 0xF0)
        {
            // 1110xxxx: three-byte sequence
            pendingBytes = 2;
        }
        else if (current < 0xF8)
        {
            // 11110xxx: four-byte sequence, lead bytes from 0xF5 on are rejected
            if (current >= 0xf5)
                return CP_ACP;
            pendingBytes = 3;
        }
        else
        {
            // 11111xxx is never valid
            return CP_ACP;
        }
    }

    // valid multi-byte sequences and no sequence cut off at the end
    if (sawNonAscii && (pendingBytes == 0))
        return CP_UTF8;

    inconclusive = true;

    return CP_ACP;
}