+ fix: Onigmo backward search

+ fix: treat all ASCII line breaks (LF, VT, FF, CR) as line terminators for the end-of-line ($) metacharacter
This commit is contained in:
Rainer Kottenhoff 2018-03-01 16:17:11 +01:00
parent cce69112ca
commit 85dd425893
3 changed files with 41 additions and 51 deletions

View File

@ -252,17 +252,21 @@ static int
is_mbc_newline(const UChar* p, const UChar* end, OnigEncoding enc)
{
if (p < end) {
if ((*p == 0x0a) || (*p == 0x0d)) return 1; // LF or CR
#ifdef USE_UNICODE_ALL_LINE_TERMINATORS
if (*p == 0x0b || *p == 0x0c || *p == 0x0d) return 1;
if (*p == 0x0a) return 1; // LF
#if defined(USE_ASCII_ALL_LINE_BREAKS) || defined(USE_UNICODE_ALL_LINE_TERMINATORS)
if (*p == 0x0b || *p == 0x0c || *p == 0x0d) return 1; // VT FF CR
#endif
#ifdef USE_UNICODE_ALL_LINE_TERMINATORS
if (p + 1 < end) {
if (*(p+1) == 0x85 && *p == 0xc2) /* U+0085 */
return 1;
if (*(p + 1) == 0x85 && *p == 0xc2) /* U+0085 */
return 1;
if (p + 2 < end) {
if ((*(p+2) == 0xa8 || *(p+2) == 0xa9)
&& *(p+1) == 0x80 && *p == 0xe2) /* U+2028, U+2029 */
return 1;
if ((*(p + 2) == 0xa8 || *(p + 2) == 0xa9)
&& *(p + 1) == 0x80 && *p == 0xe2) /* U+2028, U+2029 */
return 1;
}
}
#endif
@ -359,7 +363,7 @@ code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc ARG_UNUSED)
static int
mbc_case_fold(OnigCaseFoldType flag, const UChar** pp,
const UChar* end, UChar* fold, OnigEncoding enc)
const UChar* end, UChar* fold, OnigEncoding enc)
{
const UChar* p = *pp;
@ -367,10 +371,10 @@ mbc_case_fold(OnigCaseFoldType flag, const UChar** pp,
#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
if (*p == 0x49) {
*fold++ = 0xc4;
*fold = 0xb1;
(*pp)++;
return 2;
*fold++ = 0xc4;
*fold = 0xb1;
(*pp)++;
return 2;
}
}
#endif
@ -387,7 +391,7 @@ mbc_case_fold(OnigCaseFoldType flag, const UChar** pp,
static int
get_ctype_code_range(OnigCtype ctype, OnigCodePoint *sb_out,
const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED)
const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED)
{
*sb_out = 0x80;
return onigenc_unicode_ctype_code_range(ctype, ranges);

View File

@ -133,6 +133,7 @@ typedef struct {
#define USE_UNICODE_PROPERTIES
#define USE_UNICODE_AGE_PROPERTIES
/* #define USE_UNICODE_CASE_FOLD_TURKISH_AZERI */
#define USE_ASCII_ALL_LINE_BREAKS // LF, VT, FF, CR
/* #define USE_UNICODE_ALL_LINE_TERMINATORS */ /* see Unicode.org UTS #18 */
@ -187,8 +188,8 @@ ONIG_EXTERN int onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, OnigA
#define UTF16_IS_SURROGATE_SECOND(c) (((c) & 0xfc) == 0xdc)
#define UTF16_IS_SURROGATE(c) (((c) & 0xf8) == 0xd8)
#define UNICODE_VALID_CODEPOINT_P(c) ( \
((c) <= 0x10ffff) && \
!((c) < 0x10000 && UTF16_IS_SURROGATE((c) >> 8)))
((c) <= 0x10ffff) && \
!((c) < 0x10000 && UTF16_IS_SURROGATE((c) >> 8)))
#define ONIGENC_ISO_8859_1_TO_LOWER_CASE(c) \
OnigEncISO_8859_1_ToLowerCaseTable[c]
@ -234,8 +235,8 @@ extern int ONIG_ENC_REGISTER(const char *, OnigEncoding);
# define OnigEncodingDefine(f,n) \
OnigEncodingDeclare(n); \
void Init_##f(void) { \
ONIG_ENC_REGISTER(OnigEncodingName(n).name, \
&OnigEncodingName(n)); \
ONIG_ENC_REGISTER(OnigEncodingName(n).name, \
&OnigEncodingName(n)); \
} \
OnigEncodingDeclare(n)
#else

View File

@ -205,7 +205,7 @@ static void replaceAll(std::string& source, const std::string& from, const std::
* Has not been tested with backwards DBCS searches yet.
*/
long OnigmoRegExEngine::FindText(Document* doc, Sci::Position minPos, Sci::Position maxPos, const char *pattern,
bool caseSensitive, bool word, bool wordStart, int searchFlags, Sci::Position *length)
bool caseSensitive, bool word, bool wordStart, int searchFlags, Sci::Position *length)
{
if (!(pattern && (strlen(pattern) > 0))) {
*length = 0;
@ -214,15 +214,18 @@ long OnigmoRegExEngine::FindText(Document* doc, Sci::Position minPos, Sci::Posit
Sci::Position docLen = SciPos(doc->Length());
const bool findForward = (minPos <= maxPos);
const int increment = findForward ? 1 : -1;
// Range endpoints should not be inside DBCS characters, but just in case, move them.
minPos = doc->MovePositionOutsideChar(minPos, 1, false);
maxPos = doc->MovePositionOutsideChar(maxPos, 1, false);
const bool findprevious = (minPos > maxPos);
Sci::Position rangeBeg = (findprevious) ? maxPos : minPos;
Sci::Position rangeEnd = (findprevious) ? minPos : maxPos;
minPos = doc->MovePositionOutsideChar(minPos, increment, false);
maxPos = doc->MovePositionOutsideChar(maxPos, increment, false);
Sci::Position rangeBeg = (findForward) ? minPos : maxPos;
Sci::Position rangeEnd = (findForward) ? maxPos : minPos;
Sci::Position rangeLen = (rangeEnd - rangeBeg);
// -----------------------------
// --- Onigmo Engine Options ---
// -----------------------------
@ -241,7 +244,7 @@ long OnigmoRegExEngine::FindText(Document* doc, Sci::Position minPos, Sci::Posit
else {
ONIG_OPTION_OFF(onigmoOptions, ONIG_OPTION_DOTALL);
}
//ONIG_OPTION_ON(onigmoOptions, ONIG_OPTION_SINGLELINE);
ONIG_OPTION_ON(onigmoOptions, ONIG_OPTION_NEGATE_SINGLELINE);
@ -257,8 +260,7 @@ long OnigmoRegExEngine::FindText(Document* doc, Sci::Position minPos, Sci::Posit
bool bReCompile = (m_RegExpr == nullptr) || (m_CmplOptions != onigmoOptions) || (m_RegExprStrg.compare(sRegExprStrg) != 0);
if (bReCompile)
{
if (bReCompile) {
m_RegExprStrg.clear();
m_RegExprStrg = sRegExprStrg;
m_CmplOptions = onigmoOptions;
@ -289,12 +291,14 @@ long OnigmoRegExEngine::FindText(Document* doc, Sci::Position minPos, Sci::Posit
UChar* docBegPtr = (UChar*)doc->RangePointer(0, docLen);
UChar* docSEndPtr = (UChar*)doc->RangePointer(docLen, 0);
UChar* rangeBegPtr = (UChar*)doc->RangePointer(rangeBeg, rangeLen);
UChar* rangeEndPtr = (UChar*)doc->RangePointer(rangeEnd, rangeLen);
UChar* rangeEndPtr = (UChar*)doc->RangePointer(rangeEnd, 0);
OnigPosition result = ONIG_MISMATCH;
try {
result = onig_search(m_RegExpr, docBegPtr, docSEndPtr, rangeBegPtr, rangeEndPtr, &m_Region, onigmoOptions);
if (findForward)
result = onig_search(m_RegExpr, docBegPtr, docSEndPtr, rangeBegPtr, rangeEndPtr, &m_Region, onigmoOptions);
else // X //
result = onig_search(m_RegExpr, docBegPtr, docSEndPtr, rangeEndPtr, rangeBegPtr, &m_Region, onigmoOptions);
}
catch (...) {
return Cast2long(-3); // -1 is normally used for not found, -3 is used here for exception
@ -305,26 +309,7 @@ long OnigmoRegExEngine::FindText(Document* doc, Sci::Position minPos, Sci::Posit
return Cast2long(-3);
}
if (findprevious) // search for last occurrence in range
{
//SPEEDUP: onig_scan() ???
while ((result >= 0) && (rangeBegPtr <= rangeEndPtr))
{
m_MatchPos = SciPos(result); //SciPos(m_Region.beg[0]);
m_MatchLen = SciPos(m_Region.end[0] - result);
rangeBegPtr = docBegPtr + (m_MatchPos + max(1,m_MatchLen));
try {
result = onig_search(m_RegExpr, docBegPtr, docSEndPtr, rangeBegPtr, rangeEndPtr, &m_Region, onigmoOptions);
}
catch (...) {
return Cast2long(-3);
}
}
}
else if ((result >= 0) && (rangeBegPtr <= rangeEndPtr))
if ((result >= 0) && (rangeBegPtr <= rangeEndPtr))
{
m_MatchPos = SciPos(result); //SciPos(m_Region.beg[0]);
m_MatchLen = SciPos(m_Region.end[0] - result);