+fix: regex dot (.) must not match the linefeed (LF)/newline character unless dot-matches-line-breaks is enabled

This commit is contained in:
METANEOCORTEX\Kotti 2024-02-26 02:04:26 +01:00
parent 7b35d58af3
commit 3985f999b5
7 changed files with 79 additions and 124 deletions

View File

@ -64,7 +64,7 @@ using namespace Scintilla::Internal;
// End-of-line convention selector.
// CRLF / CR / LF carry the values of the corresponding SC_EOL_* constants;
// UDEF (-1) is a separate value outside that set (presumably "undefined" - confirm with callers).
enum class EOLmode : int {
    UDEF = -1,
    CRLF = SC_EOL_CRLF,
    CR   = SC_EOL_CR,
    LF   = SC_EOL_LF
};
static OnigEncoding s_UsedEncodingsTypes[] = { ONIG_ENCODING_UTF8, ONIG_ENCODING_UTF8_CR };
static OnigEncoding s_UsedEncodingsTypes[] = { ONIG_ENCODING_UTF8 };
// ============================================================================
// ============================================================================
@ -328,11 +328,9 @@ Sci::Position OnigurumaRegExEngine::FindText(Document* doc, Sci::Position minPos
try {
OnigEncoding const onigEncType = ((eolMode == EOLmode::CR) ? ONIG_ENCODING_UTF8_CR : ONIG_ENCODING_UTF8);
OnigErrorInfo einfo;
int const res = onig_new(&m_RegExpr, UCharCPtr(m_RegExprStrg.c_str()), UCharCPtr(m_RegExprStrg.c_str() + m_RegExprStrg.length()),
m_CmplOptions, onigEncType, &m_OnigSyntax, &einfo);
m_CmplOptions, ONIG_ENCODING_UTF8, &m_OnigSyntax, &einfo);
if (res != ONIG_NORMAL) {
onig_error_code_to_str(UCharPtr(m_ErrorInfo), res, &einfo);
@ -795,10 +793,8 @@ OnigPos SimpleRegExEngine::Find(const OnigUChar* pattern, const OnigUChar* docum
try {
onig_free(m_RegExpr);
OnigEncoding const onigEncType = ((m_EOLmode == EOLmode::CR) ? ONIG_ENCODING_UTF8_CR : ONIG_ENCODING_UTF8);
OnigErrorInfo einfo;
int res = onig_new(&m_RegExpr, pattern, (pattern + patternLen), m_Options, onigEncType, &m_OnigSyntax, &einfo);
int res = onig_new(&m_RegExpr, pattern, (pattern + patternLen), m_Options, ONIG_ENCODING_UTF8, &m_OnigSyntax, &einfo);
if (res != ONIG_NORMAL) {
//onig_error_code_to_str(m_ErrorInfo, res, &einfo);

View File

@ -93,12 +93,37 @@ ascii_is_code_ctype(OnigCodePoint code, unsigned int ctype)
return FALSE;
}
/*
 * Newline predicate installed in the OnigEncodingASCII descriptor.
 *
 *   p   - position in the subject text to test
 *   end - one past the last byte of the subject text
 *
 * Returns 1 if a line terminator starts at p, 0 otherwise.
 * Both CR and LF are accepted as terminators.  When
 * USE_END_OF_FILE_AS_LINE_TERMINATOR is defined, the END_OF_FILE byte and the
 * end-of-text position (p == end) are accepted as well.
 */
static int
ascii_is_newline(const UChar *p, const UChar *end) {
#ifdef USE_CRNL_AS_LINE_TERMINATOR
  /* Look-ahead is guarded with a distance check instead of `p + 1 < end`:
   * this function is called with p == end, and forming p + 1 there would
   * produce a pointer beyond one-past-the-end (undefined behaviour). */
  if ((end - p) >= 2) {
    /* Explicit CR LF pair.  The single-byte test below would accept the CR
     * on its own as well; the pair test keeps the CRLF intent visible. */
    if ((*p == CARRIAGE_RET) && (*(p+1) == NEWLINE_CODE))
      return 1;
  }
#endif
  if (p < end) {
#ifdef USE_END_OF_FILE_AS_LINE_TERMINATOR
    if ((*p == CARRIAGE_RET) || (*p == NEWLINE_CODE) || (*p == END_OF_FILE))
      return 1;
#else
    if ((*p == CARRIAGE_RET) || (*p == NEWLINE_CODE))
      return 1;
#endif
  }
#ifdef USE_END_OF_FILE_AS_LINE_TERMINATOR
  /* Reaching the end of the text counts as a line end. */
  if (p == end)
    return 1;
#endif
  return 0;
}
OnigEncodingType OnigEncodingASCII = {
onigenc_single_byte_mbc_enc_len,
"US-ASCII", /* name */
1, /* max enc length */
1, /* min enc length */
onigenc_is_mbc_newline_0x0a,
ascii_is_newline,
onigenc_single_byte_mbc_to_code,
onigenc_single_byte_code_to_mbclen,
onigenc_single_byte_code_to_mbc,
@ -116,29 +141,3 @@ OnigEncodingType OnigEncodingASCII = {
ENC_FLAG_ASCII_COMPATIBLE|ENC_FLAG_SKIP_OFFSET_1,
0, 0
};
/*
 * US-ASCII encoding descriptor whose newline test is
 * onigenc_is_mbc_newline_0x0d (CR-based).  It reports the same name string
 * ("US-ASCII") as OnigEncodingASCII, so the two cannot be told apart by name.
 * Entries are positional; the field names are declared with OnigEncodingType,
 * which is not part of this file.
 */
OnigEncodingType OnigEncodingASCII_CR = {
onigenc_single_byte_mbc_enc_len,
"US-ASCII", /* name */
1, /* max enc length */
1, /* min enc length */
onigenc_is_mbc_newline_0x0d, /* newline predicate: carriage return */
onigenc_single_byte_mbc_to_code,
onigenc_single_byte_code_to_mbclen,
onigenc_single_byte_code_to_mbc,
onigenc_ascii_mbc_case_fold,
onigenc_ascii_apply_all_case_fold,
onigenc_ascii_get_case_fold_codes_by_str,
onigenc_minimum_property_name_to_ctype,
ascii_is_code_ctype,
onigenc_not_support_get_ctype_code_range,
onigenc_single_byte_left_adjust_char_head,
onigenc_always_true_is_allowed_reverse_match,
init, /* init */
0, /* is_initialized */
onigenc_always_true_is_valid_mbc_string,
ENC_FLAG_ASCII_COMPATIBLE|ENC_FLAG_SKIP_OFFSET_1,
0, 0
};

View File

@ -4,16 +4,12 @@
#ifndef _ONIGURUMA_CONFIG_H_
#define _ONIGURUMA_CONFIG_H_
#define STDC_HEADERS 1
#define HAVE_INTTYPES_H 1
#define HAVE_SYS_TYPES_H 1
#define HAVE_SYS_STAT_H 1
#define HAVE_STDLIB_H 1
#define HAVE_STRING_H 1
#define HAVE_MEMORY_H 1
#define HAVE_FLOAT_H 1
#define HAVE_OFF_T 1
#define SIZEOF_INT 4
#define SIZEOF_SHORT 2
#define SIZEOF_LONG 4
#define SIZEOF_LONG_LONG 8
#define SIZEOF___INT64 8
@ -27,9 +23,7 @@
#endif
#define SIZEOF_FLOAT 4
#define SIZEOF_DOUBLE 8
#define HAVE_PROTOTYPES 1
#define TOKEN_PASTE(x,y) x##y
#define HAVE_STDARG_PROTOTYPES 1
#ifndef NORETURN
#if _MSC_VER > 1100
#define NORETURN(x) __declspec(noreturn) x
@ -38,53 +32,24 @@
#endif
#endif
#define HAVE_DECL_SYS_NERR 1
#define STDC_HEADERS 1
#define HAVE_STDINT_H 1
#define HAVE_STDLIB_H 1
#define HAVE_STRING_H 1
#define HAVE_LIMITS_H 1
#define HAVE_FCNTL_H 1
#define HAVE_SYS_UTIME_H 1
#define HAVE_MEMORY_H 1
#define uid_t int
#define gid_t int
#define HAVE_STRUCT_STAT_ST_RDEV 1
#define HAVE_ST_RDEV 1
#define GETGROUPS_T int
#define RETSIGTYPE void
#define HAVE_ALLOCA 1
#define HAVE_DUP2 1
#define HAVE_MEMCMP 1
#define HAVE_MEMMOVE 1
#define HAVE_MKDIR 1
#define HAVE_STRCASECMP 1
#define HAVE_STRNCASECMP 1
#define HAVE_STRERROR 1
#define HAVE_STRFTIME 1
#define HAVE_STRCHR 1
#define HAVE_STRSTR 1
#define HAVE_STRTOD 1
#define HAVE_STRTOL 1
#define HAVE_STRTOUL 1
#define HAVE_FLOCK 1
#define HAVE_VSNPRINTF 1
#define HAVE_FINITE 1
#define HAVE_FMOD 1
#define HAVE_FREXP 1
#define HAVE_HYPOT 1
#define HAVE_MODF 1
#define HAVE_WAITPID 1
#define HAVE_CHSIZE 1
#define HAVE_TIMES 1
#define HAVE__SETJMP 1
#define HAVE_TELLDIR 1
#define HAVE_SEEKDIR 1
#define HAVE_MKTIME 1
#define HAVE_COSH 1
#define HAVE_SINH 1
#define HAVE_TANH 1
#define HAVE_EXECVE 1
#define HAVE_TZNAME 1
#define HAVE_DAYLIGHT 1
#define SETPGRP_VOID 1
#define inline __inline

View File

@ -189,9 +189,7 @@ ONIG_EXTERN OnigEncodingType OnigEncodingBIG5;
ONIG_EXTERN OnigEncodingType OnigEncodingGB18030;
#else // lean and mean
ONIG_EXTERN OnigEncodingType OnigEncodingASCII;
ONIG_EXTERN OnigEncodingType OnigEncodingASCII_CR;
ONIG_EXTERN OnigEncodingType OnigEncodingUTF8;
ONIG_EXTERN OnigEncodingType OnigEncodingUTF8_CR;
#endif
#if 0
@ -228,9 +226,7 @@ ONIG_EXTERN OnigEncodingType OnigEncodingUTF8_CR;
#define ONIG_ENCODING_GB18030 (&OnigEncodingGB18030)
#else // lean and mean
#define ONIG_ENCODING_ASCII (&OnigEncodingASCII)
#define ONIG_ENCODING_ASCII_CR (&OnigEncodingASCII_CR)
#define ONIG_ENCODING_UTF8 (&OnigEncodingUTF8)
#define ONIG_ENCODING_UTF8_CR (&OnigEncodingUTF8_CR)
#endif
#define ONIG_ENCODING_UNDEF ((OnigEncoding )0)

View File

@ -697,6 +697,7 @@ onigenc_not_support_get_ctype_code_range(OnigCtype ctype ARG_UNUSED,
return ONIG_NO_SUPPORT_CONFIG;
}
extern int
onigenc_is_mbc_newline_0x0a(const UChar* p, const UChar* end)
{
@ -714,22 +715,6 @@ onigenc_is_mbc_newline_0x0a(const UChar* p, const UChar* end)
return 0;
}
/*
 * Newline predicate that recognises carriage return (0x0d) as the line
 * terminator.
 *
 *   p   - position in the subject text to test
 *   end - one past the last byte of the subject text
 *
 * Returns 1 if *p is CARRIAGE_RET, 0 otherwise.  When
 * USE_END_OF_FILE_AS_LINE_TERMINATOR is defined, the END_OF_FILE byte and the
 * end-of-text position (p == end) also yield 1.
 */
extern int
onigenc_is_mbc_newline_0x0d(const UChar* p, const UChar* end)
{
  if (p < end) {
    const UChar ch = *p;
    int is_terminator = (ch == CARRIAGE_RET);
#ifdef USE_END_OF_FILE_AS_LINE_TERMINATOR
    is_terminator = is_terminator || (ch == END_OF_FILE);
#endif
    return is_terminator ? 1 : 0;
  }
#ifdef USE_END_OF_FILE_AS_LINE_TERMINATOR
  /* No byte left to inspect: the end of the text itself ends the line. */
  return (p == end) ? 1 : 0;
#else
  return 0;
#endif
}
/* for single byte encodings */
extern int

View File

@ -120,11 +120,12 @@ struct PropertyNameCtype {
#define USE_END_OF_FILE_AS_LINE_TERMINATOR
#define USE_CRNL_AS_LINE_TERMINATOR
#define USE_UNICODE_PROPERTIES
#define USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
#define USE_UNICODE_WORD_BREAK
/* #define USE_UNICODE_CASE_FOLD_TURKISH_AZERI */
/* #define USE_UNICODE_ALL_LINE_TERMINATORS */ /* see Unicode.org UTS #18 */
#define USE_UNICODE_ALL_LINE_TERMINATORS /* see Unicode.org UTS #18 */
//~#define ONIG_ENCODING_INIT_DEFAULT ONIG_ENCODING_ASCII
@ -159,7 +160,7 @@ extern int onigenc_apply_all_case_fold_with_map P_((int map_size, const OnigPair
extern int onigenc_get_case_fold_codes_by_str_with_map P_((int map_size, const OnigPairCaseFoldCodes map[], int ess_tsett_flag, OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[]));
extern int onigenc_not_support_get_ctype_code_range P_((OnigCtype ctype, OnigCodePoint* sb_out, const OnigCodePoint* ranges[]));
extern int onigenc_is_mbc_newline_0x0a P_((const UChar* p, const UChar* end));
extern int onigenc_is_mbc_newline_0x0d P_((const UChar* p, const UChar* end));
/* methods for single byte encoding */
extern int onigenc_ascii_mbc_case_fold P_((OnigCaseFoldType flag, const UChar** p, const UChar* end, UChar* lower));

View File

@ -261,6 +261,50 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag,
flag, p, end, items);
}
/*
 * Newline predicate installed in the OnigEncodingUTF8 descriptor.
 *
 *   p   - position in the UTF-8 subject text to test
 *   end - one past the last byte of the subject text
 *
 * Returns 1 if a line terminator starts at p, 0 otherwise.
 * CR and LF are always accepted.  Depending on the build configuration:
 *   USE_CRNL_AS_LINE_TERMINATOR        - explicit CR LF pair test
 *   USE_UNICODE_ALL_LINE_TERMINATORS   - LS (E2 80 A8), PS (E2 80 A9),
 *                                        NEL (C2 85), VT (0x0b), FF (0x0c)
 *   USE_END_OF_FILE_AS_LINE_TERMINATOR - END_OF_FILE byte and p == end
 */
static int
is_utf8_newline(const UChar *p, const UChar *end)
{
  /* All look-ahead guards compare the distance (end - p) rather than forming
   * p + 1 / p + 2: this function is called with p == end, and those pointers
   * would then lie beyond one-past-the-end (undefined behaviour). */
#ifdef USE_CRNL_AS_LINE_TERMINATOR
  if ((end - p) >= 2) {
    if ((*p == CARRIAGE_RET) && (*(p+1) == NEWLINE_CODE)) /* CRLF */
      return 1;
  }
#endif
#ifdef USE_UNICODE_ALL_LINE_TERMINATORS
  if ((end - p) >= 3) {
    if ((*p == 0xe2) && (*(p+1) == 0x80) && ((*(p+2) == 0xa8) || (*(p+2) == 0xa9))) /* LS or PS */
      return 1;
  }
  if ((end - p) >= 2) {
    if ((*p == 0xc2) && (*(p+1) == 0x85)) /* NEL */
      return 1;
  }
#endif
  if (p < end) {
#ifdef USE_END_OF_FILE_AS_LINE_TERMINATOR
    if ((*p == CARRIAGE_RET) || (*p == NEWLINE_CODE) || (*p == END_OF_FILE))
      return 1;
#else
    if ((*p == CARRIAGE_RET) || (*p == NEWLINE_CODE))
      return 1;
#endif
#ifdef USE_UNICODE_ALL_LINE_TERMINATORS
    if ((*p == 0x0b) || (*p == 0x0c)) /* VT or FF */
      return 1;
#endif
  }
#ifdef USE_END_OF_FILE_AS_LINE_TERMINATOR
  /* Reaching the end of the text counts as a line end. */
  if (p == end)
    return 1;
#endif
  return 0;
}
OnigEncodingType OnigEncodingUTF8 = {
mbc_enc_len,
"UTF-8", /* name */
@ -270,7 +314,7 @@ OnigEncodingType OnigEncodingUTF8 = {
6,
#endif
1, /* min enc length */
onigenc_is_mbc_newline_0x0a,
is_utf8_newline,
mbc_to_code,
code_to_mbclen,
code_to_mbc,
@ -288,34 +332,3 @@ OnigEncodingType OnigEncodingUTF8 = {
ENC_FLAG_ASCII_COMPATIBLE|ENC_FLAG_UNICODE|ENC_FLAG_SKIP_OFFSET_1_OR_0,
0, 0
};
/*
 * UTF-8 encoding descriptor whose newline test is
 * onigenc_is_mbc_newline_0x0d (CR-based).  It reports the same name string
 * ("UTF-8") as OnigEncodingUTF8.  Entries are positional; the field names are
 * declared with OnigEncodingType, which is not part of this file.
 */
OnigEncodingType OnigEncodingUTF8_CR = {
mbc_enc_len,
"UTF-8", /* name */
#ifdef USE_RFC3629_RANGE
4, /* max enc length */
#else
6, /* max enc length when not restricted to the RFC 3629 range */
#endif
1, /* min enc length */
/* newline predicate slot: a file-local is_mbc_newline was listed here before (left disabled below) */
//is_mbc_newline,
onigenc_is_mbc_newline_0x0d, /* newline predicate: carriage return */
mbc_to_code,
code_to_mbclen,
code_to_mbc,
mbc_case_fold,
onigenc_unicode_apply_all_case_fold,
get_case_fold_codes_by_str,
onigenc_unicode_property_name_to_ctype,
onigenc_unicode_is_code_ctype,
get_ctype_code_range,
left_adjust_char_head,
onigenc_always_true_is_allowed_reverse_match,
NULL, /* init */
NULL, /* is_initialized */
is_valid_mbc_string,
ENC_FLAG_ASCII_COMPATIBLE|ENC_FLAG_UNICODE|ENC_FLAG_SKIP_OFFSET_1_OR_0,
0, 0
};