Notepad3/grepWinNP3/sktoolslib_mod/TextFile.cpp
2023-05-03 15:59:29 +02:00

625 lines
20 KiB
C++

// sktoolslib - common files for SK tools
// Copyright (C) 2012, 2014, 2017-2023 - Stefan Kueng
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software Foundation,
// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
//
#include "stdafx.h"
#include "TextFile.h"
#include "PathUtils.h"
#include "maxpath.h"
#include <memory>
static wchar_t WideCharSwap(wchar_t nValue)
{
return (((nValue >> 8)) | (nValue << 8));
}
static UINT64 WordSwapBytes(UINT64 nValue)
{
return ((nValue & 0xff00ff00ff00ff) << 8) | ((nValue >> 8) & 0xff00ff00ff00ff); // swap BYTESs in WORDs
}
CTextFile::CTextFile()
: pFileBuf(nullptr)
, fileLen(0)
, encoding(AutoType)
, hasBOM(false)
, nullByteCount(2)
{
}
CTextFile::~CTextFile()
{
pFileBuf = nullptr;
}
bool CTextFile::Save(LPCWSTR path, bool keepFileDate) const
{
if (pFileBuf == nullptr)
return false;
FILETIME creationTime{};
FILETIME lastAccessTime{};
FILETIME lastWriteTime{};
if (keepFileDate)
{
HANDLE hFile = CreateFile(path, GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
nullptr, OPEN_EXISTING, FILE_FLAG_SEQUENTIAL_SCAN, nullptr);
if (hFile != INVALID_HANDLE_VALUE)
{
GetFileTime(hFile, &creationTime, &lastAccessTime, &lastWriteTime);
CloseHandle(hFile);
}
}
HANDLE hFile = CreateFile(path, GENERIC_WRITE, FILE_SHARE_READ,
nullptr, CREATE_ALWAYS, FILE_FLAG_SEQUENTIAL_SCAN, nullptr);
if (hFile == INVALID_HANDLE_VALUE)
{
assert(false);
return false;
}
DWORD byteswritten;
if (!WriteFile(hFile, pFileBuf.get(), fileLen, &byteswritten, nullptr))
{
CloseHandle(hFile);
return false;
}
if (keepFileDate)
SetFileTime(hFile, &creationTime, &lastAccessTime, &lastWriteTime);
CloseHandle(hFile);
return true;
}
bool CTextFile::Load(LPCWSTR path, UnicodeType &type, bool bUTF8, std::atomic_bool &bCancelled)
{
encoding = AutoType;
type = AutoType;
LARGE_INTEGER lint;
pFileBuf = nullptr;
auto pathBuf = std::make_unique<wchar_t[]>(MAX_PATH_NEW);
HANDLE hFile = INVALID_HANDLE_VALUE;
int retryCounter = 0;
if ((wcslen(path) > 2) && (path[0] == '\\') && (path[1] == '\\'))
{
// UNC path
wcscpy_s(pathBuf.get(), MAX_PATH_NEW, L"\\\\?\\UNC");
wcscat_s(pathBuf.get(), MAX_PATH_NEW, &path[1]);
}
else
{
// 'normal' path
wcscpy_s(pathBuf.get(), MAX_PATH_NEW, L"\\\\?\\");
wcscat_s(pathBuf.get(), MAX_PATH_NEW, path);
}
do
{
if (retryCounter)
Sleep(20 + retryCounter * 50);
hFile = CreateFile(pathBuf.get(), GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
nullptr, OPEN_EXISTING, FILE_FLAG_SEQUENTIAL_SCAN, nullptr);
retryCounter++;
} while (hFile == INVALID_HANDLE_VALUE && retryCounter < 5);
if (hFile == INVALID_HANDLE_VALUE)
return false;
std::wstring wPath(path);
size_t pos = wPath.find_last_of('\\');
filename = wPath.substr(pos + 1);
if (!GetFileSizeEx(hFile, &lint))
{
CloseHandle(hFile);
return false;
}
MEMORYSTATUSEX memEx = {sizeof(MEMORYSTATUSEX)};
GlobalMemoryStatusEx(&memEx);
DWORD bytesRead = 0;
DWORD bytesToRead = min(lint.LowPart, DWORD(memEx.ullAvailPhys / 4UL));
if (lint.HighPart)
bytesToRead = 500000; // read 50kb if the file is too big: we only
// need to scan for the file type then.
// if there isn't enough RAM available, only load a small part of the file
// to do the encoding check. Then only load the full file in case
// the encoding is UNICODE_LE since that's the only encoding we have
// to convert first to do a proper search with.
if (bytesToRead < lint.LowPart)
{
try
{
auto tempFileBuf = std::make_unique<BYTE[]>(bytesToRead + 1);
if (!ReadFile(hFile, tempFileBuf.get(), bytesToRead, &bytesRead, nullptr))
{
CloseHandle(hFile);
return false;
}
encoding = CheckUnicodeType(tempFileBuf.get(), bytesRead);
type = encoding;
if (lint.HighPart)
{
CloseHandle(hFile);
return false;
}
switch (encoding)
{
case Binary:
case UTF8:
case Ansi:
CloseHandle(hFile);
return false;
default:
pFileBuf = std::make_unique<BYTE[]>(lint.LowPart);
if (pFileBuf)
{
for (unsigned long bc = 0; bc < bytesRead; ++bc)
{
pFileBuf[bc] = tempFileBuf[bc];
}
}
break;
}
}
catch (const std::exception &)
{
return false;
}
}
else
{
try
{
pFileBuf = std::make_unique<BYTE[]>(lint.LowPart);
}
catch (const std::exception &)
{
return false;
}
}
if ((pFileBuf == nullptr) || (!ReadFile(hFile, pFileBuf.get(), lint.LowPart, &bytesRead, nullptr)))
{
pFileBuf = nullptr;
CloseHandle(hFile);
return false;
}
CloseHandle(hFile);
fileLen = lint.LowPart;
// we have the file read into memory, now we have to find out what
// kind of text file we have here.
if (encoding == AutoType)
{
encoding = CheckUnicodeType(pFileBuf.get(), bytesRead);
if ((bUTF8) && (encoding != Binary))
encoding = UTF8;
}
if (encoding == Unicode_Le)
{
try
{
if ((bytesRead > 1) && (*static_cast<unsigned char *>(pFileBuf.get()) == 0xFF))
{
// remove the BOM
textContent.assign((reinterpret_cast<wchar_t *>(pFileBuf.get()) + 1), (bytesRead / sizeof(wchar_t)) - 1);
hasBOM = true;
}
else
textContent.assign(reinterpret_cast<wchar_t *>(pFileBuf.get()), bytesRead / sizeof(wchar_t));
}
catch (const std::exception &)
{
return false;
}
}
else if (encoding == Unicode_Be)
{
// make in place WORD BYTEs swap
UINT64 *pQw = reinterpret_cast<UINT64 *>(pFileBuf.get());
int nQwords = bytesRead / 8;
for (int nQword = 0; nQword < nQwords; nQword++)
pQw[nQword] = WordSwapBytes(pQw[nQword]);
wchar_t *pW = reinterpret_cast<wchar_t *>(pQw);
int nWords = bytesRead / 2;
for (int nWord = nQwords * 4; nWord < nWords; nWord++)
pW[nWord] = WideCharSwap(pW[nWord]);
try
{
if ((bytesRead > 1) && (*static_cast<unsigned char *>(pFileBuf.get()) == 0xFF))
{
// remove the BOM
textContent.assign((reinterpret_cast<wchar_t *>(pFileBuf.get()) + 1), (bytesRead / sizeof(wchar_t)) - 1);
hasBOM = true;
}
else
textContent.assign(reinterpret_cast<wchar_t *>(pFileBuf.get()), bytesRead / sizeof(wchar_t));
}
catch (const std::exception &)
{
return false;
}
}
else if ((encoding == UTF8) || ((encoding == Binary) && (bUTF8)))
{
try
{
int ret = MultiByteToWideChar(CP_UTF8, 0, reinterpret_cast<LPCSTR>(pFileBuf.get()), bytesRead, nullptr, 0);
auto pWideBuf = std::make_unique<wchar_t[]>(ret + 1);
int ret2 = MultiByteToWideChar(CP_UTF8, 0, reinterpret_cast<LPCSTR>(pFileBuf.get()), bytesRead, pWideBuf.get(), ret + 1);
if (ret2 == ret)
{
if ((ret > 1) && (*pWideBuf.get() == 0xFEFF))
{
// remove the BOM
textContent.assign(pWideBuf.get() + 1, ret - 1);
hasBOM = true;
}
else
textContent.assign(pWideBuf.get(), ret);
}
}
catch (const std::exception &)
{
return false;
}
}
else // if (encoding == ANSI)
{
try
{
int ret = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, reinterpret_cast<LPCSTR>(pFileBuf.get()), bytesRead, nullptr, 0);
auto pWideBuf = std::make_unique<wchar_t[]>(ret + 1);
int ret2 = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, reinterpret_cast<LPCSTR>(pFileBuf.get()), bytesRead, pWideBuf.get(), ret + 1);
if (ret2 == ret)
textContent.assign(pWideBuf.get(), ret);
}
catch (const std::exception &)
{
return false;
}
}
type = encoding;
if (type == Binary)
return true;
return CalculateLines(bCancelled);
}
void CTextFile::SetFileContent(const std::wstring &content)
{
pFileBuf = nullptr;
fileLen = 0;
try
{
if (encoding == Unicode_Le)
{
if (hasBOM)
{
pFileBuf = std::make_unique<BYTE[]>((content.size() + 2) * sizeof(wchar_t));
if (pFileBuf)
{
memcpy(pFileBuf.get(), "\xFF\xFE", sizeof(wchar_t));
memcpy(pFileBuf.get() + 2, content.c_str(), content.size() * sizeof(wchar_t));
fileLen = (static_cast<int>(content.size()) + 1) * sizeof(wchar_t);
}
}
else
{
pFileBuf = std::make_unique<BYTE[]>(content.size() * sizeof(wchar_t));
if (pFileBuf)
{
memcpy(pFileBuf.get(), content.c_str(), content.size() * sizeof(wchar_t));
fileLen = static_cast<int>(content.size()) * sizeof(wchar_t);
}
}
}
else if (encoding == Unicode_Be)
{
if (hasBOM)
{
pFileBuf = std::make_unique<BYTE[]>((content.size() + 2) * sizeof(wchar_t));
if (pFileBuf)
{
memcpy(pFileBuf.get(), "\xFF\xFE", sizeof(wchar_t));
memcpy(pFileBuf.get() + 2, content.c_str(), content.size() * sizeof(wchar_t));
fileLen = (static_cast<int>(content.size()) + 1) * sizeof(wchar_t);
}
}
else
{
pFileBuf = std::make_unique<BYTE[]>(content.size() * sizeof(wchar_t));
if (pFileBuf)
{
memcpy(pFileBuf.get(), content.c_str(), content.size() * sizeof(wchar_t));
fileLen = static_cast<int>(content.size()) * sizeof(wchar_t);
}
}
// make in place WORD BYTEs swap
UINT64 *pQw = reinterpret_cast<UINT64 *>(pFileBuf.get());
int nQwords = fileLen / 8;
for (int nQword = 0; nQword < nQwords; nQword++)
pQw[nQword] = WordSwapBytes(pQw[nQword]);
wchar_t *pW = reinterpret_cast<wchar_t *>(pQw);
int nWords = fileLen / 2;
for (int nWord = nQwords * 4; nWord < nWords; nWord++)
pW[nWord] = WideCharSwap(pW[nWord]);
}
else if (encoding == UTF8)
{
if (hasBOM)
{
int ret = WideCharToMultiByte(CP_UTF8, 0, content.c_str(), -1, nullptr, 0, nullptr, nullptr);
pFileBuf = std::make_unique<BYTE[]>(ret + 3);
if (pFileBuf)
{
memcpy(pFileBuf.get(), "\xEF\xBB\xBF", 3);
int ret2 = WideCharToMultiByte(CP_UTF8, 0, content.c_str(), -1, reinterpret_cast<LPSTR>(pFileBuf.get()) + 3, ret, nullptr, nullptr);
fileLen = ret2 + 2;
if (ret2 != ret)
{
pFileBuf = nullptr;
fileLen = 0;
}
}
}
else
{
int ret = WideCharToMultiByte(CP_UTF8, 0, content.c_str(), -1, nullptr, 0, nullptr, nullptr);
pFileBuf = std::make_unique<BYTE[]>(ret);
if (pFileBuf)
{
int ret2 = WideCharToMultiByte(CP_UTF8, 0, content.c_str(), -1, reinterpret_cast<LPSTR>(pFileBuf.get()), ret, nullptr, nullptr);
fileLen = ret2 - 1;
if (ret2 != ret)
{
pFileBuf = nullptr;
fileLen = 0;
}
}
}
}
else if ((encoding == Ansi) || (encoding == Binary))
{
int ret = WideCharToMultiByte(CP_ACP, 0, content.c_str(), static_cast<int>(content.size()) + 1, nullptr, 0, nullptr, nullptr);
pFileBuf = std::make_unique<BYTE[]>(ret);
if (pFileBuf)
{
int ret2 = WideCharToMultiByte(CP_ACP, 0, content.c_str(), static_cast<int>(content.size()) + 1, reinterpret_cast<LPSTR>(pFileBuf.get()), ret, nullptr, nullptr);
fileLen = ret2 - 1;
if (ret2 != ret)
{
pFileBuf = nullptr;
fileLen = 0;
}
}
}
}
catch (const std::exception &)
{
}
if (pFileBuf)
textContent = content;
else
textContent = L"";
}
bool CTextFile::ContentsModified(std::unique_ptr<BYTE[]> pBuf, DWORD newLen)
{
pFileBuf = std::move(pBuf);
fileLen = newLen;
return true;
}
CTextFile::UnicodeType CTextFile::CheckUnicodeType(BYTE *pBuffer, int cb) const
{
if (cb < 2)
return Ansi;
UINT16 *pVal16 = reinterpret_cast<UINT16 *>(pBuffer);
UINT8 *pVal8 = reinterpret_cast<UINT8 *>(pVal16 + 1);
// scan the whole buffer for a 0x0000 sequence
// if found, we assume a binary file
int nNull = 0;
int nDblNull = 0;
for (int i = 0; i < (cb - 2); i = i + 2)
{
if (0x0000 == *pVal16++)
++nDblNull;
if (0x00 == *pVal8++)
++nNull;
if (0x00 == *pVal8++)
++nNull;
}
if (nDblNull > nullByteCount) // configured value: allow double null chars to account for 'broken' text files
return Binary;
pVal16 = reinterpret_cast<UINT16 *>(pBuffer);
pVal8 = reinterpret_cast<UINT8 *>(pVal16 + 1);
if (*pVal16 == 0xFEFF)
return Unicode_Le;
if (*pVal16 == 0xFFFE)
return Unicode_Be;
if ((nNull > 3) && ((cb % 2) == 0)) // arbitrary value: allow three null chars to account for 'broken' ANSI/UTF8 text files, otherwise consider the file UTF16-LE
return Unicode_Le;
if (cb < 3)
return Ansi;
if (*pVal16 == 0xBBEF)
{
if (*pVal8 == 0xBF)
return UTF8;
}
// check for illegal UTF8 chars
pVal8 = static_cast<UINT8 *>(pBuffer);
for (int i = 0; i < cb; ++i)
{
if ((*pVal8 == 0xC0) || (*pVal8 == 0xC1) || (*pVal8 >= 0xF5))
return Ansi;
pVal8++;
}
pVal8 = static_cast<UINT8 *>(pBuffer);
bool bUTF8 = false;
for (int i = 0; i < (cb - 4); ++i)
{
if ((*pVal8 & 0xE0) == 0xC0)
{
pVal8++;
i++;
if ((*pVal8 & 0xC0) != 0x80)
return Ansi;
bUTF8 = true;
}
if ((*pVal8 & 0xF0) == 0xE0)
{
pVal8++;
i++;
if ((*pVal8 & 0xC0) != 0x80)
return Ansi;
pVal8++;
i++;
if ((*pVal8 & 0xC0) != 0x80)
return Ansi;
bUTF8 = true;
}
if ((*pVal8 & 0xF8) == 0xF0)
{
pVal8++;
i++;
if ((*pVal8 & 0xC0) != 0x80)
return Ansi;
pVal8++;
i++;
if ((*pVal8 & 0xC0) != 0x80)
return Ansi;
pVal8++;
i++;
if ((*pVal8 & 0xC0) != 0x80)
return Ansi;
bUTF8 = true;
}
pVal8++;
}
if (bUTF8)
return UTF8;
return Ansi;
}
bool CTextFile::CalculateLines(std::atomic_bool &bCancelled)
{
// fill an array with starting positions for every line in the loaded file
if (pFileBuf == nullptr)
return false;
if (textContent.empty())
return true;
linePositions.clear();
linePositions.reserve(textContent.size() / 10);
size_t pos = 0;
for (auto it = textContent.begin(); it != textContent.end() && !bCancelled; ++it)
{
if (*it == '\r')
{
++it;
++pos;
if (it != textContent.end())
{
if (*it == '\n')
{
// crlf lineending
linePositions.push_back(pos);
}
else
{
// cr lineending
linePositions.push_back(pos - 1);
}
}
else
break;
}
else if (*it == '\n')
{
// lf lineending
linePositions.push_back(pos);
}
++pos;
}
linePositions.push_back(pos);
return true;
}
long CTextFile::LineFromPosition(long pos) const
{
auto lb = std::lower_bound(linePositions.begin(), linePositions.end(), static_cast<size_t>(pos));
auto lbLine = lb - linePositions.begin();
return static_cast<long>(lbLine + 1);
}
std::wstring CTextFile::GetLineString(long lineNumber) const
{
if ((lineNumber - 2) >= static_cast<long>(linePositions.size()))
return std::wstring();
if (lineNumber <= 0)
return std::wstring();
long startPos = 0;
if (lineNumber > 1)
startPos = static_cast<long>(linePositions[lineNumber - 2]) + 1;
std::wstring endChars(L"\n\0", 2);
size_t endPos = textContent.find_first_of(endChars, startPos);
std::wstring line;
if (endPos != std::wstring::npos)
line = std::wstring(textContent.begin() + startPos, textContent.begin() + endPos);
else
line = std::wstring(textContent.begin() + startPos, textContent.end());
return line;
}
std::wstring CTextFile::GetEncodingString(UnicodeType type)
{
switch (type)
{
case CTextFile::Ansi:
return L"ANSI";
case CTextFile::Unicode_Le:
return L"UTF-16-LE";
case CTextFile::Unicode_Be:
return L"UTF-16-BE";
case CTextFile::UTF8:
return L"UTF8";
case CTextFile::Binary:
return L"BINARY";
default:
break;
}
return {};
}
std::wstring CTextFile::GetFileNameWithoutExtension() const
{
return CPathUtils::GetFileNameWithoutExtension(filename);
}
std::wstring CTextFile::GetFileNameExtension() const
{
return CPathUtils::GetFileExtension(filename);
}