From 3985f999b5b447f015bc6575a5e9bf5ec0ff43af Mon Sep 17 00:00:00 2001 From: "METANEOCORTEX\\Kotti" Date: Mon, 26 Feb 2024 02:04:26 +0100 Subject: [PATCH] +fix: regex dot(.) not matching linefeed(LF)/newline character, if not enabled for line-breaks --- .../scintilla/OnigurumaRegExEngine.cxx | 10 +-- scintilla/oniguruma/src/ascii.c | 53 +++++++------ scintilla/oniguruma/src/config.h | 37 +-------- scintilla/oniguruma/src/oniguruma.h | 4 - scintilla/oniguruma/src/regenc.c | 17 +--- scintilla/oniguruma/src/regenc.h | 5 +- scintilla/oniguruma/src/utf8.c | 77 +++++++++++-------- 7 files changed, 79 insertions(+), 124 deletions(-) diff --git a/scintilla/oniguruma/scintilla/OnigurumaRegExEngine.cxx b/scintilla/oniguruma/scintilla/OnigurumaRegExEngine.cxx index 95d5fa460..f4f15b3d4 100644 --- a/scintilla/oniguruma/scintilla/OnigurumaRegExEngine.cxx +++ b/scintilla/oniguruma/scintilla/OnigurumaRegExEngine.cxx @@ -64,7 +64,7 @@ using namespace Scintilla::Internal; enum class EOLmode : int { UDEF = -1, CRLF = SC_EOL_CRLF, CR = SC_EOL_CR, LF = SC_EOL_LF }; -static OnigEncoding s_UsedEncodingsTypes[] = { ONIG_ENCODING_UTF8, ONIG_ENCODING_UTF8_CR }; +static OnigEncoding s_UsedEncodingsTypes[] = { ONIG_ENCODING_UTF8 }; // ============================================================================ // ============================================================================ @@ -328,11 +328,9 @@ Sci::Position OnigurumaRegExEngine::FindText(Document* doc, Sci::Position minPos try { - OnigEncoding const onigEncType = ((eolMode == EOLmode::CR) ? ONIG_ENCODING_UTF8_CR : ONIG_ENCODING_UTF8); - OnigErrorInfo einfo; int const res = onig_new(&m_RegExpr, UCharCPtr(m_RegExprStrg.c_str()), UCharCPtr(m_RegExprStrg.c_str() + m_RegExprStrg.length()), - m_CmplOptions, onigEncType, &m_OnigSyntax, &einfo); + m_CmplOptions, ONIG_ENCODING_UTF8, &m_OnigSyntax, &einfo); if (res != ONIG_NORMAL) { onig_error_code_to_str(UCharPtr(m_ErrorInfo), res, &einfo); @@ -795,10 +793,8 @@ OnigPos SimpleRegExEngine::Find(const OnigUChar* pattern, const OnigUChar* docum try { onig_free(m_RegExpr); - OnigEncoding const onigEncType = ((m_EOLmode == EOLmode::CR) ? ONIG_ENCODING_UTF8_CR : ONIG_ENCODING_UTF8); - OnigErrorInfo einfo; - int res = onig_new(&m_RegExpr, pattern, (pattern + patternLen), m_Options, onigEncType, &m_OnigSyntax, &einfo); + int res = onig_new(&m_RegExpr, pattern, (pattern + patternLen), m_Options, ONIG_ENCODING_UTF8, &m_OnigSyntax, &einfo); if (res != ONIG_NORMAL) { //onig_error_code_to_str(m_ErrorInfo, res, &einfo); diff --git a/scintilla/oniguruma/src/ascii.c b/scintilla/oniguruma/src/ascii.c index c12828bff..b8235f8a7 100644 --- a/scintilla/oniguruma/src/ascii.c +++ b/scintilla/oniguruma/src/ascii.c @@ -93,12 +93,37 @@ ascii_is_code_ctype(OnigCodePoint code, unsigned int ctype) return FALSE; } +static int +ascii_is_newline(const UChar *p, const UChar *end) { +#ifdef USE_CRNL_AS_LINE_TERMINATOR + if (p + 1 < end) { + if ((*p == CARRIAGE_RET) && (*(p+1) == NEWLINE_CODE)) + return 1; + } +#endif + if (p < end) { +#ifdef USE_END_OF_FILE_AS_LINE_TERMINATOR + if ((*p == CARRIAGE_RET) || (*p == NEWLINE_CODE) || (*p == END_OF_FILE)) + return 1; +#else + if ((*p == CARRIAGE_RET) || (*p == NEWLINE_CODE)) + return 1; +#endif + } +#ifdef USE_END_OF_FILE_AS_LINE_TERMINATOR + if (p == end) + return 1; +#endif + return 0; +} + + OnigEncodingType OnigEncodingASCII = { onigenc_single_byte_mbc_enc_len, "US-ASCII", /* name */ 1, /* max enc length */ 1, /* min enc length */ - onigenc_is_mbc_newline_0x0a, + ascii_is_newline, onigenc_single_byte_mbc_to_code, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, @@ -116,29 +141,3 @@ OnigEncodingType OnigEncodingASCII = { ENC_FLAG_ASCII_COMPATIBLE|ENC_FLAG_SKIP_OFFSET_1, 0, 0 }; - - -OnigEncodingType OnigEncodingASCII_CR = { - onigenc_single_byte_mbc_enc_len, - "US-ASCII", /* name */ - 1, /* max enc length */ - 1, /* min enc length */ - onigenc_is_mbc_newline_0x0d, - onigenc_single_byte_mbc_to_code, - onigenc_single_byte_code_to_mbclen, - onigenc_single_byte_code_to_mbc, - onigenc_ascii_mbc_case_fold, - onigenc_ascii_apply_all_case_fold, - onigenc_ascii_get_case_fold_codes_by_str, - onigenc_minimum_property_name_to_ctype, - ascii_is_code_ctype, - onigenc_not_support_get_ctype_code_range, - onigenc_single_byte_left_adjust_char_head, - onigenc_always_true_is_allowed_reverse_match, - init, - 0, /* is_initialized */ - onigenc_always_true_is_valid_mbc_string, - ENC_FLAG_ASCII_COMPATIBLE|ENC_FLAG_SKIP_OFFSET_1, - 0, 0 -}; - diff --git a/scintilla/oniguruma/src/config.h b/scintilla/oniguruma/src/config.h index b2bc793c9..13c7d1c61 100644 --- a/scintilla/oniguruma/src/config.h +++ b/scintilla/oniguruma/src/config.h @@ -4,16 +4,12 @@ #ifndef _ONIGURUMA_CONFIG_H_ #define _ONIGURUMA_CONFIG_H_ -#define STDC_HEADERS 1 +#define HAVE_INTTYPES_H 1 #define HAVE_SYS_TYPES_H 1 #define HAVE_SYS_STAT_H 1 -#define HAVE_STDLIB_H 1 -#define HAVE_STRING_H 1 #define HAVE_MEMORY_H 1 -#define HAVE_FLOAT_H 1 #define HAVE_OFF_T 1 #define SIZEOF_INT 4 -#define SIZEOF_SHORT 2 #define SIZEOF_LONG 4 #define SIZEOF_LONG_LONG 8 #define SIZEOF___INT64 8 @@ -27,9 +23,7 @@ #endif #define SIZEOF_FLOAT 4 #define SIZEOF_DOUBLE 8 -#define HAVE_PROTOTYPES 1 #define TOKEN_PASTE(x,y) x##y -#define HAVE_STDARG_PROTOTYPES 1 #ifndef NORETURN #if _MSC_VER > 1100 #define NORETURN(x) __declspec(noreturn) x @@ -38,53 +32,24 @@ #endif #endif #define HAVE_DECL_SYS_NERR 1 -#define STDC_HEADERS 1 -#define HAVE_STDINT_H 1 -#define HAVE_STDLIB_H 1 -#define HAVE_STRING_H 1 -#define HAVE_LIMITS_H 1 #define HAVE_FCNTL_H 1 #define HAVE_SYS_UTIME_H 1 #define HAVE_MEMORY_H 1 #define uid_t int #define gid_t int -#define HAVE_STRUCT_STAT_ST_RDEV 1 -#define HAVE_ST_RDEV 1 #define GETGROUPS_T int -#define RETSIGTYPE void #define HAVE_ALLOCA 1 #define HAVE_DUP2 1 -#define HAVE_MEMCMP 1 -#define HAVE_MEMMOVE 1 #define HAVE_MKDIR 1 -#define HAVE_STRCASECMP 1 -#define HAVE_STRNCASECMP 1 -#define HAVE_STRERROR 1 -#define HAVE_STRFTIME 1 -#define HAVE_STRCHR 1 -#define HAVE_STRSTR 1 -#define HAVE_STRTOD 1 -#define HAVE_STRTOL 1 -#define HAVE_STRTOUL 1 #define HAVE_FLOCK 1 -#define HAVE_VSNPRINTF 1 #define HAVE_FINITE 1 -#define HAVE_FMOD 1 -#define HAVE_FREXP 1 #define HAVE_HYPOT 1 -#define HAVE_MODF 1 #define HAVE_WAITPID 1 #define HAVE_CHSIZE 1 #define HAVE_TIMES 1 -#define HAVE__SETJMP 1 #define HAVE_TELLDIR 1 #define HAVE_SEEKDIR 1 -#define HAVE_MKTIME 1 -#define HAVE_COSH 1 -#define HAVE_SINH 1 -#define HAVE_TANH 1 #define HAVE_EXECVE 1 -#define HAVE_TZNAME 1 #define HAVE_DAYLIGHT 1 #define SETPGRP_VOID 1 #define inline __inline diff --git a/scintilla/oniguruma/src/oniguruma.h b/scintilla/oniguruma/src/oniguruma.h index 11be2b843..ba4cfb136 100644 --- a/scintilla/oniguruma/src/oniguruma.h +++ b/scintilla/oniguruma/src/oniguruma.h @@ -189,9 +189,7 @@ ONIG_EXTERN OnigEncodingType OnigEncodingBIG5; ONIG_EXTERN OnigEncodingType OnigEncodingGB18030; #else // lean and mean ONIG_EXTERN OnigEncodingType OnigEncodingASCII; -ONIG_EXTERN OnigEncodingType OnigEncodingASCII_CR; ONIG_EXTERN OnigEncodingType OnigEncodingUTF8; -ONIG_EXTERN OnigEncodingType OnigEncodingUTF8_CR; #endif #if 0 @@ -228,9 +226,7 @@ ONIG_EXTERN OnigEncodingType OnigEncodingUTF8_CR; #define ONIG_ENCODING_GB18030 (&OnigEncodingGB18030) #else // lean and mean #define ONIG_ENCODING_ASCII (&OnigEncodingASCII) -#define ONIG_ENCODING_ASCII_CR (&OnigEncodingASCII_CR) #define ONIG_ENCODING_UTF8 (&OnigEncodingUTF8) -#define ONIG_ENCODING_UTF8_CR (&OnigEncodingUTF8_CR) #endif #define ONIG_ENCODING_UNDEF ((OnigEncoding )0) diff --git a/scintilla/oniguruma/src/regenc.c b/scintilla/oniguruma/src/regenc.c index f5c60ab96..033a6a344 100644 --- a/scintilla/oniguruma/src/regenc.c +++ b/scintilla/oniguruma/src/regenc.c @@ -697,6 +697,7 @@ onigenc_not_support_get_ctype_code_range(OnigCtype ctype ARG_UNUSED, return ONIG_NO_SUPPORT_CONFIG; } + extern int onigenc_is_mbc_newline_0x0a(const UChar* p, const UChar* end) { @@ -714,22 +715,6 @@ onigenc_is_mbc_newline_0x0a(const UChar* p, const UChar* end) return 0; } -extern int -onigenc_is_mbc_newline_0x0d(const UChar* p, const UChar* end) -{ - if (p < end) { -#ifdef USE_END_OF_FILE_AS_LINE_TERMINATOR - if ((*p == CARRIAGE_RET)||(*p == END_OF_FILE)) return 1; // CR -#else - if (*p == CARRIAGE_RET) return 1; // CR -#endif - } -#ifdef USE_END_OF_FILE_AS_LINE_TERMINATOR - if (p == end) - return 1; -#endif - return 0; -} /* for single byte encodings */ extern int diff --git a/scintilla/oniguruma/src/regenc.h b/scintilla/oniguruma/src/regenc.h index 5aab14df4..d58f724f6 100644 --- a/scintilla/oniguruma/src/regenc.h +++ b/scintilla/oniguruma/src/regenc.h @@ -120,11 +120,12 @@ struct PropertyNameCtype { #define USE_END_OF_FILE_AS_LINE_TERMINATOR #define USE_CRNL_AS_LINE_TERMINATOR + #define USE_UNICODE_PROPERTIES #define USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER #define USE_UNICODE_WORD_BREAK /* #define USE_UNICODE_CASE_FOLD_TURKISH_AZERI */ -/* #define USE_UNICODE_ALL_LINE_TERMINATORS */ /* see Unicode.org UTS #18 */ +#define USE_UNICODE_ALL_LINE_TERMINATORS /* see Unicode.org UTS #18 */ //~#define ONIG_ENCODING_INIT_DEFAULT ONIG_ENCODING_ASCII @@ -159,7 +160,7 @@ extern int onigenc_apply_all_case_fold_with_map P_((int map_size, const OnigPair extern int onigenc_get_case_fold_codes_by_str_with_map P_((int map_size, const OnigPairCaseFoldCodes map[], int ess_tsett_flag, OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])); extern int onigenc_not_support_get_ctype_code_range P_((OnigCtype ctype, OnigCodePoint* sb_out, const OnigCodePoint* ranges[])); extern int onigenc_is_mbc_newline_0x0a P_((const UChar* p, const UChar* end)); -extern int onigenc_is_mbc_newline_0x0d P_((const UChar* p, const UChar* end)); + /* methods for single byte encoding */ extern int onigenc_ascii_mbc_case_fold P_((OnigCaseFoldType flag, const UChar** p, const UChar* end, UChar* lower)); diff --git a/scintilla/oniguruma/src/utf8.c b/scintilla/oniguruma/src/utf8.c index a19291bd2..34fb5cee2 100644 --- a/scintilla/oniguruma/src/utf8.c +++ b/scintilla/oniguruma/src/utf8.c @@ -261,6 +261,50 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, flag, p, end, items); } +static int +is_utf8_newline(const UChar *p, const UChar *end) +{ +#ifdef USE_CRNL_AS_LINE_TERMINATOR + if (p + 1 < end) { + if ((*p == CARRIAGE_RET) && (*(p+1) == NEWLINE_CODE)) // CRLF + return 1; + } +#endif + +#ifdef USE_UNICODE_ALL_LINE_TERMINATORS + if (p + 2 < end) { + if ((*p == 0xe2) && (*(p+1) == 0x80) && ((*(p+2) == 0xa8) || (*(p+2) == 0xa9))) // LS or PS + return 1; + } + if (p + 1 < end) { + if ((*p == 0xc2) && (*(p+1) == 0x85)) // NEL + return 1; + } +#endif + + if (p < end) { +#ifdef USE_END_OF_FILE_AS_LINE_TERMINATOR + if ((*p == CARRIAGE_RET) || (*p == NEWLINE_CODE) || (*p == END_OF_FILE)) + return 1; +#else + if ((*p == CARRIAGE_RET) || (*p == NEWLINE_CODE)) + return 1; +#endif +#ifdef USE_UNICODE_ALL_LINE_TERMINATORS + if ((*p == 0x0b) || (*p == 0x0c)) // VT or FF + return 1; +#endif + } + +#ifdef USE_END_OF_FILE_AS_LINE_TERMINATOR + if (p == end) + return 1; +#endif + + return 0; +} + + OnigEncodingType OnigEncodingUTF8 = { mbc_enc_len, "UTF-8", /* name */ @@ -270,7 +314,7 @@ OnigEncodingType OnigEncodingUTF8 = { 6, #endif 1, /* min enc length */ - onigenc_is_mbc_newline_0x0a, + is_utf8_newline, mbc_to_code, code_to_mbclen, code_to_mbc, @@ -288,34 +332,3 @@ OnigEncodingType OnigEncodingUTF8 = { ENC_FLAG_ASCII_COMPATIBLE|ENC_FLAG_UNICODE|ENC_FLAG_SKIP_OFFSET_1_OR_0, 0, 0 }; - - -OnigEncodingType OnigEncodingUTF8_CR = { - mbc_enc_len, - "UTF-8", /* name */ -#ifdef USE_RFC3629_RANGE - 4, /* max enc length */ -#else - 6, -#endif - 1, /* min enc length */ - //is_mbc_newline, - onigenc_is_mbc_newline_0x0d, - mbc_to_code, - code_to_mbclen, - code_to_mbc, - mbc_case_fold, - onigenc_unicode_apply_all_case_fold, - get_case_fold_codes_by_str, - onigenc_unicode_property_name_to_ctype, - onigenc_unicode_is_code_ctype, - get_ctype_code_range, - left_adjust_char_head, - onigenc_always_true_is_allowed_reverse_match, - NULL, /* init */ - NULL, /* is_initialized */ - is_valid_mbc_string, - ENC_FLAG_ASCII_COMPATIBLE|ENC_FLAG_UNICODE|ENC_FLAG_SKIP_OFFSET_1_OR_0, - 0, 0 -}; -