Notepad3/grepWinNP3/sktoolslib_mod/TextFile.cpp
2020-06-10 13:12:32 +02:00

585 lines
18 KiB
C++

// sktoolslib - common files for SK tools
// Copyright (C) 2012, 2014, 2017-2020 - Stefan Kueng
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software Foundation,
// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
//
#include "stdafx.h"
#include "TextFile.h"
#include "PathUtils.h"
#include "maxpath.h"
#include <memory>
// Returns (nValue >> 8) | (nValue << 8), converted back to wchar_t.
// With a 16-bit wchar_t (as on Windows) this exchanges the high and the
// low byte of the character, i.e. converts between UTF-16 LE and BE.
static inline wchar_t WideCharSwap(wchar_t nValue)
{
    const auto upperShiftedDown = nValue >> 8;
    const auto lowerShiftedUp   = nValue << 8;
    return static_cast<wchar_t>(upperShiftedDown | lowerShiftedUp);
}
// Swaps the two bytes inside each of the four 16-bit words packed in a
// 64-bit value, so four UTF-16 code units are byte-swapped at once.
static inline UINT64 WordSwapBytes(UINT64 nValue)
{
    // selects the low byte of every 16-bit word
    const UINT64 lowByteOfEachWord = 0x00ff00ff00ff00ffULL;

    const UINT64 lowBytesMovedUp    = (nValue & lowByteOfEachWord) << 8;
    const UINT64 highBytesMovedDown = (nValue >> 8) & lowByteOfEachWord;
    return lowBytesMovedUp | highBytesMovedDown;
}
CTextFile::CTextFile(void)
: pFileBuf(nullptr)
, filelen(0)
, hasBOM(false)
, encoding(AUTOTYPE)
{
}
// Releases the raw file buffer.
CTextFile::~CTextFile(void)
{
    pFileBuf.reset();
}
// Writes the raw file buffer (filelen bytes of pFileBuf) to 'path',
// replacing an existing file.
//
// path:    destination file path, passed to CreateFile unchanged.
// returns: true only if the file could be created and all filelen bytes
//          were written. false if there is no buffer, the file could not
//          be created, the write failed or fewer bytes than requested
//          were written.
bool CTextFile::Save(LPCTSTR path)
{
    if (pFileBuf == nullptr)
        return false;

    HANDLE hFile = CreateFile(path, GENERIC_WRITE, FILE_SHARE_READ,
                              NULL, CREATE_ALWAYS, FILE_FLAG_SEQUENTIAL_SCAN, NULL);
    if (hFile == INVALID_HANDLE_VALUE)
        return false;

    DWORD byteswritten = 0;
    // a successful call that wrote fewer bytes than requested would leave a
    // truncated file behind, so treat a short write as a failure too
    const bool bWritten = WriteFile(hFile, pFileBuf.get(), filelen, &byteswritten, NULL) &&
                          (byteswritten == static_cast<DWORD>(filelen));
    CloseHandle(hFile);
    return bWritten;
}
// Loads the file at 'path' into memory, detects its encoding and converts
// the content to UTF-16 in textcontent, then calculates the line positions.
//
// path:       file to load. It is prefixed with "\\?\" (or "\\?\UNC" for
//             UNC paths) before it is opened.
// type:       receives the detected encoding. It is also set when the
//             function returns false because only the start of the file
//             was examined.
// bUTF8:      if true, every file that is not detected as BINARY is
//             treated as UTF-8, and BINARY files are converted as UTF-8.
// bCancelled: optional cancel flag, forwarded to CalculateLines().
//
// returns:    false if the file could not be opened or read, if memory
//             could not be allocated, or if the file was only sampled
//             (4 GiB or larger, or larger than a quarter of the available
//             physical memory and not UTF-16). For BINARY files true is
//             returned without calculating lines.
bool CTextFile::Load(LPCTSTR path, UnicodeType &type, bool bUTF8, volatile LONG* bCancelled)
{
    encoding = AUTOTYPE;
    type     = AUTOTYPE;
    hasBOM   = false; // the BOM state of a previously loaded file must not carry over
    LARGE_INTEGER lint;
    pFileBuf          = nullptr;
    auto   pathbuf    = std::make_unique<wchar_t[]>(MAX_PATH_NEW);
    HANDLE hFile      = INVALID_HANDLE_VALUE;
    int    retrycounter = 0;

    if ((_tcslen(path) > 2) && (path[0] == '\\') && (path[1] == '\\'))
    {
        // UNC path
        _tcscpy_s(pathbuf.get(), MAX_PATH_NEW, L"\\\\?\\UNC");
        _tcscat_s(pathbuf.get(), MAX_PATH_NEW, &path[1]);
    }
    else
    {
        // 'normal' path
        _tcscpy_s(pathbuf.get(), MAX_PATH_NEW, L"\\\\?\\");
        _tcscat_s(pathbuf.get(), MAX_PATH_NEW, path);
    }

    // the file may be locked for a short time by another process:
    // try up to five times with a short pause in between
    do
    {
        if (retrycounter)
            Sleep(20);
        hFile = CreateFile(pathbuf.get(), GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
                           NULL, OPEN_EXISTING, FILE_FLAG_SEQUENTIAL_SCAN, NULL);
        retrycounter++;
    } while (hFile == INVALID_HANDLE_VALUE && retrycounter < 5);

    if (hFile == INVALID_HANDLE_VALUE)
        return false;

    // remember the file name: everything after the last backslash
    // (the whole path if there is no backslash, since npos + 1 == 0)
    std::wstring wpath(path);
    size_t       pos = wpath.find_last_of('\\');
    filename         = wpath.substr(pos + 1);

    if (!GetFileSizeEx(hFile, &lint))
    {
        CloseHandle(hFile);
        return false;
    }

    MEMORYSTATUSEX memex = {sizeof(MEMORYSTATUSEX)};
    GlobalMemoryStatusEx(&memex);
    DWORD bytesread = 0;
    // Use at most a quarter of the available physical memory. The comparison
    // is done in 64 bits: truncating the budget to a DWORD first would wrap
    // around (e.g. to 0 with 16 GB available) and reject files that fit.
    const auto memBudget   = memex.ullAvailPhys / 4;
    DWORD      bytestoread = (memBudget < lint.LowPart) ? static_cast<DWORD>(memBudget) : lint.LowPart;
    if (lint.HighPart)
        bytestoread = 500000; // read 500 kB if the file is too big: we only
                              // need to scan for the file type then.

    // number of bytes at the start of pFileBuf that were already filled
    // from the sample read below
    DWORD prefetched = 0;

    // if there isn't enough RAM available, only load a small part of the file
    // to do the encoding check. Then only load the full file in case
    // the encoding is UTF-16 (LE or BE) since those are the only encodings
    // we have to convert first to do a proper search with.
    // Files of 4 GiB or more always take this path, whatever their LowPart is.
    if (lint.HighPart || (bytestoread < lint.LowPart))
    {
        try
        {
            auto tempfilebuf = std::make_unique<BYTE[]>(bytestoread + 1);
            if (!ReadFile(hFile, tempfilebuf.get(), bytestoread, &bytesread, NULL))
            {
                CloseHandle(hFile);
                return false;
            }
            encoding = CheckUnicodeType(tempfilebuf.get(), bytesread);
            type     = encoding;
            if (lint.HighPart)
            {
                // too big to load: the caller only gets the type
                CloseHandle(hFile);
                return false;
            }
            switch (encoding)
            {
                case BINARY:
                case UTF8:
                case ANSI:
                    CloseHandle(hFile);
                    return false;
                default:
                    pFileBuf = std::make_unique<BYTE[]>(lint.LowPart);
                    // keep the sample; the rest of the file is appended below
                    memcpy(pFileBuf.get(), tempfilebuf.get(), bytesread);
                    prefetched = bytesread;
                    break;
            }
        }
        catch (const std::exception &)
        {
            pFileBuf = nullptr;
            CloseHandle(hFile);
            return false;
        }
    }
    else
    {
        try
        {
            pFileBuf = std::make_unique<BYTE[]>(lint.LowPart);
        }
        catch (const std::exception &)
        {
            CloseHandle(hFile);
            return false;
        }
    }

    // The file pointer is positioned right after the sample (or at the start
    // if no sample was read), so continue reading behind the prefetched bytes.
    DWORD restread = 0;
    if ((pFileBuf == nullptr) ||
        (!ReadFile(hFile, pFileBuf.get() + prefetched, lint.LowPart - prefetched, &restread, NULL)))
    {
        pFileBuf = nullptr;
        CloseHandle(hFile);
        return false;
    }
    CloseHandle(hFile);
    bytesread = prefetched + restread;
    filelen   = lint.LowPart;

    // we have the file read into memory, now we have to find out what
    // kind of text file we have here.
    if (encoding == AUTOTYPE)
    {
        encoding = CheckUnicodeType(pFileBuf.get(), bytesread);
        if ((bUTF8) && (encoding != BINARY))
            encoding = UTF8;
    }

    if (encoding == UNICODE_LE)
    {
        try
        {
            if ((bytesread > 1) && (*(unsigned char *)pFileBuf.get() == 0xFF))
            {
                // remove the BOM
                textcontent.assign(((wchar_t *)pFileBuf.get() + 1), (bytesread / sizeof(wchar_t)) - 1);
                hasBOM = true;
            }
            else
                textcontent.assign((wchar_t *)pFileBuf.get(), bytesread / sizeof(wchar_t));
        }
        catch (const std::exception &)
        {
            return false;
        }
    }
    else if (encoding == UNICODE_BE)
    {
        // make in place WORD BYTEs swap: four code units at a time first,
        // then the remaining (at most three) code units one by one
        UINT64 *p_qw    = (UINT64 *)pFileBuf.get();
        int     nQwords = bytesread / 8;
        for (int nQword = 0; nQword < nQwords; nQword++)
            p_qw[nQword] = WordSwapBytes(p_qw[nQword]);

        wchar_t *p_w    = (wchar_t *)p_qw;
        int      nWords = bytesread / 2;
        for (int nWord = nQwords * 4; nWord < nWords; nWord++)
            p_w[nWord] = WideCharSwap(p_w[nWord]);

        try
        {
            // after the swap a BOM reads FF FE, just like in the LE case
            if ((bytesread > 1) && (*(unsigned char *)pFileBuf.get() == 0xFF))
            {
                // remove the BOM
                textcontent.assign(((wchar_t *)pFileBuf.get() + 1), (bytesread / sizeof(wchar_t)) - 1);
                hasBOM = true;
            }
            else
                textcontent.assign((wchar_t *)pFileBuf.get(), bytesread / sizeof(wchar_t));
        }
        catch (const std::exception &)
        {
            return false;
        }
    }
    else if ((encoding == UTF8) || ((encoding == BINARY) && (bUTF8)))
    {
        try
        {
            int  ret      = MultiByteToWideChar(CP_UTF8, 0, (LPCSTR)pFileBuf.get(), bytesread, NULL, 0);
            auto pWideBuf = std::make_unique<wchar_t[]>(ret + 1);
            int  ret2     = MultiByteToWideChar(CP_UTF8, 0, (LPCSTR)pFileBuf.get(), bytesread, pWideBuf.get(), ret + 1);
            if (ret2 == ret)
            {
                if ((ret > 1) && (*pWideBuf.get() == 0xFEFF))
                {
                    // remove the BOM
                    textcontent.assign(pWideBuf.get() + 1, ret - 1);
                    hasBOM = true;
                }
                else
                    textcontent.assign(pWideBuf.get(), ret);
            }
        }
        catch (const std::exception &)
        {
            return false;
        }
    }
    else //if (encoding == ANSI)
    {
        try
        {
            int  ret      = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, (LPCSTR)pFileBuf.get(), bytesread, NULL, 0);
            auto pWideBuf = std::make_unique<wchar_t[]>(ret + 1);
            int  ret2     = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, (LPCSTR)pFileBuf.get(), bytesread, pWideBuf.get(), ret + 1);
            if (ret2 == ret)
                textcontent.assign(pWideBuf.get(), ret);
        }
        catch (const std::exception &)
        {
            return false;
        }
    }

    type = encoding;
    if (type == BINARY)
        return true;
    return CalculateLines(bCancelled);
}
void CTextFile::SetFileContent(const std::wstring &content)
{
pFileBuf = nullptr;
filelen = 0;
try
{
if (encoding == UNICODE_LE)
{
if (hasBOM)
{
pFileBuf = std::make_unique<BYTE[]>((content.size() + 2) * sizeof(wchar_t));
if (pFileBuf)
{
memcpy(pFileBuf.get(), "\xFF\xFE", sizeof(wchar_t));
memcpy(pFileBuf.get() + 2, content.c_str(), content.size() * sizeof(wchar_t));
filelen = ((int)content.size() + 1) * sizeof(wchar_t);
}
}
else
{
pFileBuf = std::make_unique<BYTE[]>(content.size() * sizeof(wchar_t));
if (pFileBuf)
{
memcpy(pFileBuf.get(), content.c_str(), content.size() * sizeof(wchar_t));
filelen = (int)content.size() * sizeof(wchar_t);
}
}
}
else if (encoding == UNICODE_BE)
{
if (hasBOM)
{
pFileBuf = std::make_unique<BYTE[]>((content.size() + 2) * sizeof(wchar_t));
if (pFileBuf)
{
memcpy(pFileBuf.get(), "\xFF\xFE", sizeof(wchar_t));
memcpy(pFileBuf.get() + 2, content.c_str(), content.size() * sizeof(wchar_t));
filelen = ((int)content.size() + 1) * sizeof(wchar_t);
}
}
else
{
pFileBuf = std::make_unique<BYTE[]>(content.size() * sizeof(wchar_t));
if (pFileBuf)
{
memcpy(pFileBuf.get(), content.c_str(), content.size() * sizeof(wchar_t));
filelen = (int)content.size() * sizeof(wchar_t);
}
}
// make in place WORD BYTEs swap
UINT64* p_qw = (UINT64*)pFileBuf.get();
int nQwords = filelen / 8;
for (int nQword = 0; nQword < nQwords; nQword++)
p_qw[nQword] = WordSwapBytes(p_qw[nQword]);
wchar_t* p_w = (wchar_t*)p_qw;
int nWords = filelen / 2;
for (int nWord = nQwords * 4; nWord < nWords; nWord++)
p_w[nWord] = WideCharSwap(p_w[nWord]);
}
else if (encoding == UTF8)
{
if (hasBOM)
{
int ret = WideCharToMultiByte(CP_UTF8, 0, content.c_str(), -1, NULL, 0, NULL, NULL);
pFileBuf = std::make_unique<BYTE[]>(ret + 3);
if (pFileBuf)
{
memcpy(pFileBuf.get(), "\xEF\xBB\xBF", 3);
int ret2 = WideCharToMultiByte(CP_UTF8, 0, content.c_str(), -1, (LPSTR)pFileBuf.get() + 3, ret, NULL, NULL);
filelen = ret2 + 2;
if (ret2 != ret)
{
pFileBuf = nullptr;
filelen = 0;
}
}
}
else
{
int ret = WideCharToMultiByte(CP_UTF8, 0, content.c_str(), -1, NULL, 0, NULL, NULL);
pFileBuf = std::make_unique<BYTE[]>(ret);
if (pFileBuf)
{
int ret2 = WideCharToMultiByte(CP_UTF8, 0, content.c_str(), -1, (LPSTR)pFileBuf.get(), ret, NULL, NULL);
filelen = ret2 - 1;
if (ret2 != ret)
{
pFileBuf = nullptr;
filelen = 0;
}
}
}
}
else if ((encoding == ANSI) || (encoding == BINARY))
{
int ret = WideCharToMultiByte(CP_ACP, 0, content.c_str(), (int)content.size() + 1, NULL, 0, NULL, NULL);
pFileBuf = std::make_unique<BYTE[]>(ret);
if (pFileBuf)
{
int ret2 = WideCharToMultiByte(CP_ACP, 0, content.c_str(), (int)content.size() + 1, (LPSTR)pFileBuf.get(), ret, NULL, NULL);
filelen = ret2 - 1;
if (ret2 != ret)
{
pFileBuf = nullptr;
filelen = 0;
}
}
}
}
catch (const std::exception &)
{
}
if (pFileBuf)
textcontent = content;
else
textcontent = L"";
}
// Takes ownership of 'pBuf' as the new raw file buffer and records
// 'newLen' as its length. Always returns true.
bool CTextFile::ContentsModified(std::unique_ptr<BYTE[]> pBuf, DWORD newLen)
{
    filelen  = newLen;
    pFileBuf = std::move(pBuf);
    return true;
}
// Guesses the encoding of the first 'cb' bytes in 'pBuffer'.
//
// The checks are done in this order:
//  - fewer than two bytes                         -> ANSI
//  - more than two 16-bit zero values             -> BINARY
//  - buffer starts with FF FE                     -> UNICODE_LE
//  - buffer starts with FE FF                     -> UNICODE_BE
//  - more than three zero bytes and even length   -> UNICODE_LE
//  - buffer starts with EF BB BF                  -> UTF8
//  - a byte that can never appear in UTF-8        -> ANSI
//  - a broken multi byte sequence                 -> ANSI
//  - at least one valid multi byte sequence       -> UTF8
//  - everything else                              -> ANSI
//
// All accesses are done byte by byte, so the buffer neither has to be
// aligned nor is a byte beyond pBuffer[cb - 1] ever read.
CTextFile::UnicodeType CTextFile::CheckUnicodeType(BYTE *pBuffer, int cb)
{
    if (cb < 2)
        return ANSI;

    // scan the whole buffer for a 0x0000 sequence
    // if found, we assume a binary file
    int nNull    = 0;
    int nDblNull = 0;
    for (int i = 0; i < (cb - 2); i = i + 2)
    {
        // 16-bit zero at the even offset i
        if ((pBuffer[i] == 0x00) && (pBuffer[i + 1] == 0x00))
            ++nDblNull;
        // single zero bytes are counted from offset 2 on, i.e. after a possible BOM
        if (pBuffer[i + 2] == 0x00)
            ++nNull;
        // with an odd cb the last pair has only one byte left
        if (((i + 3) < cb) && (pBuffer[i + 3] == 0x00))
            ++nNull;
    }
    if (nDblNull > 2) // arbitrary value: allow two double null chars to account for 'broken' text files
        return BINARY;

    if ((pBuffer[0] == 0xFF) && (pBuffer[1] == 0xFE))
        return UNICODE_LE;
    if ((pBuffer[0] == 0xFE) && (pBuffer[1] == 0xFF))
        return UNICODE_BE;
    if ((nNull > 3) && ((cb % 2) == 0)) // arbitrary value: allow three null chars to account for 'broken' text files
        return UNICODE_LE;
    if (cb < 3)
        return ANSI;
    if ((pBuffer[0] == 0xEF) && (pBuffer[1] == 0xBB) && (pBuffer[2] == 0xBF))
        return UTF8;

    // check for illegal UTF8 chars
    for (int i = 0; i < cb; ++i)
    {
        const BYTE b = pBuffer[i];
        if ((b == 0xC0) || (b == 0xC1) || (b >= 0xF5))
            return ANSI;
    }

    // check that every lead byte is followed by the right number of
    // continuation bytes (10xxxxxx). The last four bytes are not used as
    // lead bytes so that a sequence never reaches beyond the buffer.
    bool bUTF8 = false;
    for (int i = 0; i < (cb - 4); ++i)
    {
        const BYTE lead     = pBuffer[i];
        int        trailing = 0;
        if ((lead & 0xE0) == 0xC0)
            trailing = 1; // 110xxxxx: two byte sequence
        else if ((lead & 0xF0) == 0xE0)
            trailing = 2; // 1110xxxx: three byte sequence
        else if ((lead & 0xF8) == 0xF0)
            trailing = 3; // 11110xxx: four byte sequence

        for (int t = 0; t < trailing; ++t)
        {
            ++i;
            if ((pBuffer[i] & 0xC0) != 0x80)
                return ANSI;
        }
        if (trailing > 0)
            bUTF8 = true;
    }
    if (bUTF8)
        return UTF8;
    return ANSI;
}
// Fills linepositions with the position of the line ending of every line
// in textcontent, followed by one final entry for the end of the scanned
// text.
//
// For a CRLF line ending the position of the LF is stored, for a lone CR
// or a lone LF the position of that char. Every CR that is not followed
// by an LF counts as a line ending of its own, including consecutive CRs
// and a CR at the very end of the text.
//
// bCancelled: optional flag; scanning stops as soon as it is non-zero.
//
// returns:    false if no file buffer is loaded, true otherwise (also
//             when the scan was cancelled).
bool CTextFile::CalculateLines(volatile LONG * bCancelled)
{
    // fill an array with starting positions for every line in the loaded file
    if (pFileBuf == NULL)
        return false;

    // drop the positions of previously loaded content, even if there is
    // nothing to scan now
    linepositions.clear();
    if (textcontent.empty())
        return true;
    linepositions.reserve(textcontent.size() / 10);

    const size_t len = textcontent.size();
    size_t       pos = 0;
    while ((pos < len) && ((bCancelled == nullptr) || !InterlockedExchangeAdd(bCancelled, 0)))
    {
        const wchar_t ch = textcontent[pos];
        if (ch == '\r')
        {
            // crlf lineending: step onto the lf and store its position.
            // cr lineending: store the position of the cr and leave the
            // following char for the next iteration.
            if (((pos + 1) < len) && (textcontent[pos + 1] == '\n'))
                ++pos;
            linepositions.push_back(pos);
        }
        else if (ch == '\n')
        {
            // lf lineending
            linepositions.push_back(pos);
        }
        ++pos;
    }
    linepositions.push_back(pos);
    return true;
}
// Returns the 1-based number of the line that contains the char position
// 'pos': one more than the index of the first entry in linepositions
// that is not smaller than 'pos'.
long CTextFile::LineFromPosition(long pos) const
{
    const size_t target       = static_cast<size_t>(pos);
    const auto   firstNotLess = std::lower_bound(linepositions.begin(), linepositions.end(), target);
    return long((firstNotLess - linepositions.begin()) + 1);
}
// Returns the text of the line with the 1-based number 'lineNumber'.
//
// The line starts right after the line ending stored for the previous
// line (or at the start of the text for line 1) and ends before the next
// LF or null char, or at the end of the text. A CR in front of the LF is
// part of the returned string.
//
// An empty string is returned if lineNumber is not positive, if there is
// no stored line ending for the previous line, or if the line would start
// at or behind the end of the text.
std::wstring CTextFile::GetLineString(long lineNumber) const
{
    if (lineNumber <= 0)
        return std::wstring();
    if ((lineNumber - 2) >= (long)linepositions.size())
        return std::wstring();

    size_t startpos = 0;
    if (lineNumber > 1)
        startpos = linepositions[lineNumber - 2] + 1;

    // the last entry of linepositions is the end of the text, so the start
    // computed from it lies behind the end: there is no such line
    if (startpos >= textcontent.size())
        return std::wstring();

    const std::wstring endchars(L"\n\0", 2);
    const size_t       endpos = textcontent.find_first_of(endchars, startpos);
    if (endpos == std::wstring::npos)
        return textcontent.substr(startpos);
    return textcontent.substr(startpos, endpos - startpos);
}
// Returns the result of CPathUtils::GetFileNameWithoutExtension() for the
// stored file name. Load() sets that name to the part of the loaded path
// after the last backslash.
std::wstring CTextFile::GetFileNameWithoutExtension()
{
return CPathUtils::GetFileNameWithoutExtension(filename);
}
// Returns the result of CPathUtils::GetFileExtension() for the stored
// file name. Load() sets that name to the part of the loaded path after
// the last backslash.
std::wstring CTextFile::GetFileNameExtension()
{
return CPathUtils::GetFileExtension(filename);
}