+ Integration of DeelX RegExpr engine ( see http://www.regexlab.com/en/deelx/ )

Notepad2 and also the maintaining fork notepad2-mod are using Scintilla's internal regexpr engine, which has its limitations ( see  XhmikosR/notepad2-mod#148 ).
   In wise forsight, the developer of Scintilla creates an interface (activated by preprocessor define SCI_OWNREGEX), to embed your own RegExpr search (and replace) engine.
This commit is contained in:
Rainer Kottenhoff 2016-11-27 13:25:39 +01:00
parent 3dcab849aa
commit 1d526576c9
11 changed files with 15098 additions and 6 deletions

View File

@ -121,7 +121,7 @@
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<Optimization>Disabled</Optimization>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<PreprocessorDefinitions>_SCL_SECURE_NO_WARNINGS;WIN32;_DEBUG;_WINDOWS;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;STATIC_BUILD;SCI_LEXER;USE_D2D;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions>_SCL_SECURE_NO_WARNINGS;WIN32;SCI_OWNREGEX;_DEBUG;_WINDOWS;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;STATIC_BUILD;SCI_LEXER;USE_D2D;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
<WarningLevel>Level3</WarningLevel>
</ClCompile>
@ -134,7 +134,7 @@
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<Optimization>Disabled</Optimization>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<PreprocessorDefinitions>_SCL_SECURE_NO_WARNINGS;_WIN64;_DEBUG;_WINDOWS;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;STATIC_BUILD;SCI_LEXER;USE_D2D;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions>_SCL_SECURE_NO_WARNINGS;_WIN64;SCI_OWNREGEX;_DEBUG;_WINDOWS;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;STATIC_BUILD;SCI_LEXER;USE_D2D;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
<WarningLevel>Level3</WarningLevel>
</ClCompile>
@ -150,7 +150,7 @@
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<Optimization>MaxSpeed</Optimization>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<PreprocessorDefinitions>_SCL_SECURE_NO_WARNINGS;WIN32;NDEBUG;_WINDOWS;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;STATIC_BUILD;SCI_LEXER;USE_D2D;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions>_SCL_SECURE_NO_WARNINGS;WIN32;SCI_OWNREGEX;NDEBUG;_WINDOWS;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;STATIC_BUILD;SCI_LEXER;USE_D2D;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
<WarningLevel>Level3</WarningLevel>
</ClCompile>
@ -162,7 +162,7 @@
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<Optimization>MaxSpeed</Optimization>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<PreprocessorDefinitions>_SCL_SECURE_NO_WARNINGS;_WIN64;NDEBUG;_WINDOWS;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;STATIC_BUILD;SCI_LEXER;USE_D2D;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions>_SCL_SECURE_NO_WARNINGS;_WIN64;SCI_OWNREGEX;NDEBUG;_WINDOWS;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;STATIC_BUILD;SCI_LEXER;USE_D2D;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
<WarningLevel>Level3</WarningLevel>
</ClCompile>
@ -171,6 +171,7 @@
</Lib>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="deelx\DeelxRegexSearch.cxx" />
<ClCompile Include="lexers\LexAHK.cxx" />
<ClCompile Include="lexers\LexAsm.cxx" />
<ClCompile Include="lexers\LexAU3.cxx" />
@ -249,6 +250,7 @@
<ClCompile Include="win32\ScintillaWin.cxx" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="deelx\deelx64.h" />
<ClInclude Include="include\ILexer.h" />
<ClInclude Include="include\Platform.h" />
<ClInclude Include="include\SciLexer.h" />
@ -304,6 +306,9 @@
<ClInclude Include="src\XPM.h" />
<ClInclude Include="win32\PlatWin.h" />
</ItemGroup>
<ItemGroup>
<None Include="deelx\doc\deelx_en.chm" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>

View File

@ -16,6 +16,12 @@
<Filter Include="win32">
<UniqueIdentifier>{afe7e35e-cd81-406c-a770-df29d2b3fc95}</UniqueIdentifier>
</Filter>
<Filter Include="deelx">
<UniqueIdentifier>{67242aad-9133-44e7-9774-c36f5a9194bc}</UniqueIdentifier>
</Filter>
<Filter Include="deelx\doc">
<UniqueIdentifier>{4e167b73-0447-4a31-a66b-64c2d684516d}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="lexers\LexAHK.cxx">
@ -237,6 +243,18 @@
<ClCompile Include="win32\ScintillaWin.cxx">
<Filter>win32</Filter>
</ClCompile>
<ClCompile Include="deelx\DeelxRegexSearch.cxx">
<Filter>deelx</Filter>
</ClCompile>
<ClCompile Include="lexers\LexJSON.cxx">
<Filter>lexers</Filter>
</ClCompile>
<ClCompile Include="lexers\LexMatlab.cxx">
<Filter>lexers</Filter>
</ClCompile>
<ClCompile Include="lexers\LexRegistry.cxx">
<Filter>lexers</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="include\ILexer.h">
@ -398,6 +416,16 @@
<ClInclude Include="win32\PlatWin.h">
<Filter>win32</Filter>
</ClInclude>
<ClInclude Include="include\Sci_Position.h" />
<ClInclude Include="deelx\deelx64.h">
<Filter>deelx</Filter>
</ClInclude>
<ClInclude Include="include\Sci_Position.h">
<Filter>include</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<None Include="deelx\doc\deelx_en.chm">
<Filter>deelx\doc</Filter>
</None>
</ItemGroup>
</Project>

View File

@ -0,0 +1,330 @@
/**
* @file DeelxRegexSearch.cxx
* @brief integrate DeelX regex searching for Scintilla library
* (Scintilla Lib is copyright 1998-2016 by Neil Hodgson <neilh@scintilla.org>)
*
* uses DEELX - Regular Expression Engine (v1.3) (deelx.h) - http://www.regexlab.com/deelx/
* download: http://www.regexlab.com/download/deelx/deelx.zip (v1.2)
* or : https://github.com/AndreasMartin72/mksqlite/blob/master/deelx/deelx.h (v1.3)
* (Copyright Announcement: Free to use/redistribute. Provenance must be declared when redistributed)
* API documentation see accompanying "deelx_en.chm" HTML Help.
*
* @autor Rainer Kottenhoff (RaPeHoff)
*
* Install:
* - place files (deelx64.h, DeelxRegexSearch.cxx, deelx_en.chm)
* in a directory (deelx) within the scintilla project (.../scintilla/deelx/)
* - add source files to scintilla project (Scintilla.vcxproj in VS)
* - define compiler (preprocessor) macro for scintilla project named "SCI_OWNREGEX"
* -> this will switch from scintilla's buildin regex engine to deelx's regex engine
* - recompile and link scintilla library
* - build application
*/
#ifdef SCI_OWNREGEX
#include <stdlib.h>
#include <string>
#include <vector>
#pragma warning( push )
#pragma warning( disable : 4996 ) // Scintilla's "unsafe" use of std::copy() (SplitVector.h)
// // or use -D_SCL_SECURE_NO_WARNINGS preprocessor define
#include "Platform.h"
#include "Scintilla.h"
#include "ILexer.h"
#include "SplitVector.h"
#include "Partitioning.h"
#include "CellBuffer.h"
#include "CaseFolder.h"
#include "RunStyles.h"
#include "Decoration.h"
#include "CharClassify.h"
#include "Document.h"
// ---------------------------------------------------------------
#include "deelx64.h" // DEELX - Regular Expression Engine (v1.3)
// ---------------------------------------------------------------
#ifdef SCI_NAMESPACE
using namespace Scintilla;
#endif
class DeelxRegexSearch : public RegexSearchBase
{
public:
explicit DeelxRegexSearch(CharClassify* charClassTable)
: m_RegExpr()
, m_Match()
, m_MatchPos(-1)
, m_MatchLength(0)
, m_pContext(nullptr)
, m_SubstitutionBuffer(nullptr)
{}
virtual ~DeelxRegexSearch()
{
ReleaseSubstitutionBuffer();
ReleaseContext();
}
virtual long FindText(Document* doc, int minPos, int maxPos, const char* pattern,
bool caseSensitive, bool word, bool wordStart, int flags, int* length) override;
virtual const char* SubstituteByPosition(Document* doc, const char* text, int* length) override;
private:
inline void ReleaseContext()
{
if (m_pContext != nullptr) {
m_RegExpr.ReleaseContext(m_pContext);
m_pContext = nullptr;
}
}
inline void ReleaseSubstitutionBuffer()
{
if (m_SubstitutionBuffer) {
m_RegExpr.ReleaseString(m_SubstitutionBuffer);
m_SubstitutionBuffer = nullptr;
}
}
private:
deelx::CRegexpT<char> m_RegExpr;
deelx::MatchResult m_Match;
deelx::index_t m_MatchPos;
deelx::index_t m_MatchLength;
deelx::CContext* m_pContext;
char* m_SubstitutionBuffer;
};
// ============================================================================
#ifdef SCI_NAMESPACE
RegexSearchBase *Scintilla::CreateRegexSearch(CharClassify *charClassTable)
{
return new DeelxRegexSearch(charClassTable);
}
#else
RegexSearchBase *CreateRegexSearch(CharClassify *charClassTable)
{
return new DeelxRegexSearch(charClassTable);
}
#endif
// ============================================================================
/**
* forward declaration of utility functions
*/
std::string& translateRegExpr(std::string& regExprStr, bool wholeWord, bool wordStart);
std::string& convertReplExpr(std::string& replStr);
// ============================================================================
/**
* Find text in document, supporting both forward and backward
* searches (just pass minPos > maxPos to do a backward search)
* Has not been tested with backwards DBCS searches yet.
*/
long DeelxRegexSearch::FindText(Document* doc, int minPos, int maxPos, const char *pattern,
bool caseSensitive, bool word, bool wordStart, int searchFlags, int *length)
{
int startPos, endPos;
bool left2right;
if (minPos <= maxPos) {
left2right = true;
startPos = minPos;
endPos = maxPos;
}
else { // backward search
left2right = false;
startPos = maxPos;
endPos = minPos;
}
// Range endpoints should not be inside DBCS characters, but just in case, move them.
startPos = doc->MovePositionOutsideChar(startPos, 1, false);
endPos = doc->MovePositionOutsideChar(endPos, 1, false);
int compileFlags(deelx::MULTILINE | deelx::GLOBAL | deelx::EXTENDED); // the .(dot) does not match line-breaks
//int compileFlags(deelx::SINGLELINE | deelx::MULTILINE | deelx::GLOBAL | deelx::EXTENDED); // the .(dot) also matches line-breaks
compileFlags |= (caseSensitive) ? deelx::NO_FLAG : deelx::IGNORECASE;
compileFlags |= (left2right) ? deelx::NO_FLAG : deelx::RIGHTTOLEFT;
std::string sRegExprStrg = translateRegExpr(std::string(pattern, *length), word, wordStart);
try {
m_RegExpr.Compile(sRegExprStrg.c_str(), compileFlags);
}
catch (...) {
return -2; // -1 is normally used for not found, -2 is used here for invalid regex
}
int rangeLen = endPos - startPos;
int searchStartPos = left2right ? 0 : rangeLen;
ReleaseContext();
m_pContext = m_RegExpr.PrepareMatch(doc->RangePointer(startPos, rangeLen), searchStartPos);
m_Match = m_RegExpr.Match(m_pContext);
m_MatchPos = -1; // not found
m_MatchLength = 0;
if (m_Match.IsMatched()) {
m_MatchPos = startPos + m_Match.GetStart();
m_MatchLength = (m_Match.GetEnd() - m_Match.GetStart());
}
//NOTE: potential 64-bit-size issue at interface here:
*length = static_cast<int>(m_MatchLength);
return static_cast<long>(m_MatchPos);
}
// ============================================================================
const char* DeelxRegexSearch::SubstituteByPosition(Document* doc, const char* text, int* length)
{
if (!m_Match.IsMatched() || (m_MatchPos < 0)) {
*length = 0;
return nullptr;
}
std::string sReplStrg = convertReplExpr(std::string(text, *length));
//NOTE: potential 64-bit-size issue at interface here:
const char* pString = doc->RangePointer(static_cast<int>(m_MatchPos), static_cast<int>(m_MatchLength));
deelx::index_t resLength;
ReleaseSubstitutionBuffer();
m_SubstitutionBuffer = m_RegExpr.Replace(pString, m_MatchLength, sReplStrg.c_str(),
static_cast<deelx::index_t>(sReplStrg.length()), resLength);
//NOTE: potential 64-bit-size issue at interface here:
*length = static_cast<int>(resLength);
return m_SubstitutionBuffer;
}
// ============================================================================
// ============================================================================
// Some Helpers
// ============================================================================
void replaceAll(std::string& source, const std::string& from, const std::string& to)
{
std::string newString;
newString.reserve(source.length() * 2); // avoids a few memory allocations
std::string::size_type lastPos = 0;
std::string::size_type findPos;
while (std::string::npos != (findPos = source.find(from, lastPos))) {
newString.append(source, lastPos, findPos - lastPos);
newString += to;
lastPos = findPos + from.length();
}
// Care for the rest after last occurrence
newString += source.substr(lastPos);
source.swap(newString);
}
// ----------------------------------------------------------------------------
std::string& translateRegExpr(std::string& regExprStr, bool wholeWord, bool wordStart)
{
std::string tmpStr;
if (wholeWord || wordStart) { // push '\b' at the begin of regexpr
tmpStr.push_back('\\');
tmpStr.push_back('b');
tmpStr.append(regExprStr);
if (wholeWord) { // push '\b' at the end of regexpr
tmpStr.push_back('\\');
tmpStr.push_back('b');
}
replaceAll(tmpStr, ".", "\\w");
}
else {
tmpStr.append(regExprStr);
}
std::swap(regExprStr, tmpStr);
return regExprStr;
}
// ----------------------------------------------------------------------------
std::string& convertReplExpr(std::string& replStr)
{
std::string tmpStr;
for (size_t i = 0; i < replStr.length(); ++i) {
char ch = replStr[i];
if (ch == '\\') {
ch = replStr[++i]; // next char
if (ch == '\\') {
// skip 2nd backslash ("\\")
if (i < replStr.length()) { ch = replStr[++i]; }
else { break; }
}
if (ch >= '1' && ch <= '9') {
// former behavior convenience:
// change "\\<n>" to deelx's group reference ($<n>)
tmpStr.push_back('$');
}
switch (ch) {
// check for escape seq:
case 'a':
tmpStr.push_back('\a');
break;
case 'b':
tmpStr.push_back('\b');
break;
case 'f':
tmpStr.push_back('\f');
break;
case 'n':
tmpStr.push_back('\n');
break;
case 'r':
tmpStr.push_back('\r');
break;
case 't':
tmpStr.push_back('\t');
break;
case 'v':
tmpStr.push_back('\v');
break;
case '\\':
tmpStr.push_back('\\');
break;
default:
// unknown ctrl seq
tmpStr.push_back(ch);
break;
}
}
else {
tmpStr.push_back(ch);
}
} //for
std::swap(replStr, tmpStr);
return replStr;
}
// ============================================================================
#pragma warning( pop )
#endif //SCI_OWNREGEX

4830
scintilla/deelx/deelx64.h Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,2 @@
[InternetShortcut]
URL=http://www.regexlab.com/en/deelx/introidx.htm

View File

@ -0,0 +1,426 @@
Regular Expression Syntax Reference http://www.regexlab.com/en/regref.htm
[All rights reserved: http://www.regexlab.com/en/regref.htm]
[Author: sswater shi (sswater@gmail.com)]
Introduction
Regular expression is to express a characteristic in a string, and then to match another string
with the characteristic. For example, pattern "ab+" means "one 'a' and at least one 'b' ", so "ab",
"abb", "abbbbbbb" match the pattern.
Regular expression is used to : (1) test a string whether it matches a pattern, such as a email
address. (2) to find a substring which matches certain pattern, from a whole text. (3) to do
complex replacement in a text.
It is very simple to study regular expression syntax, and the few abstract concepts can be
understood easily too. Many articles does not introduce its concepts from simple ones to
abstract ones step by step, so some persons may feel it is difficult to study. On the other hand,
each regular expression engine's document will describe its special function, but this part of
special function is not what we should study first.
1. Regular Expression Basic Syntax
1.1 Common Characters
Letters, numbers, the underline, and punctuations with no special definition are "common
characters". When regular expression matches a string, a common character can match the
same character.
- Example1: When pattern "c" matches string "abcde", match result: success; substring
matched: "c"; position: starts at 2, ends at 3.
- Example2: When pattern "bcd" matches string "abcde",match result: success; substring
matched: "bcd"; position: starts at 1, ends at 4.
1.2 Simple escaped characters
Nonprinting characters which we know:
Expression Matches
\r, \n Carriage return, newline character
\t Tabs
\\ Matches "\" itself
Some punctuations are specially defined in regular expression. To match these characters in
string, add "\" in pattern. For example: ^, $ has special definition, so we need to use "\^" and
"\$" to match them.
Expression Matches
\^ Matches "^" itself
\$ Matches "$" itself
\. Matches dot(.) itself
These escaped characters have the same effect as "common characters": to match a certain
character.
- Example1: When pattern "\$d" matches string "abc$de", match result: success; substring
matched: "$d"; position: starts at 3, ends at 5.
1.3 Expression matches anyone of many characters
Some expressions can match anyone of many characters. For example: "\d" can match any
number character. Each of these expressions can match only one character at one time, though
they can match any character of a certain group of characters.
Expression Matches
\d Any digit character, any one of 0~9
\w Any alpha, numeric, underline, any one of A~Z,a~z,0~9,_
\s Any one of space, tab, newline, return, or newpage character
. '.' matches any character except the newline character(\n)
- Example1: When pattern "\d\d" matches "abc123", match result: success; substring
matched: "12"; position: starts at 3, ends at 5.
- Example2: When pattern "a.\d" matches "aaa100", match result: success; substring
matched: "aa1"; position: starts at 1, ends at 4.
1.4 Custom expression matches anyone of many characters
Expression uses square brackets [ ] to contain a series of characters, it can match anyone of
them. Uses [^ ] to contain a series of characters, it can match anyone character except
characters contained.
Expression Matches
[ab5@] Matches "a" or "b" or "5" or "@"
[^abc] Matches any character except "a","b","c"
[f-k] Any character among "f"~"k"
[^A-F0-3] Any character except "A"~"F","0"~"3"
- Example1: When pattern "[bcd][bcd]" matches "abc123" , match result: success; substring
matched: "bc"; position: starts at 1, ends at 3.
- Example2: When pattern "[^abc]" matches "abc123", match result: success; substring
matched: "1"; position: starts at 3, ends at 4.
1.5 Special expression to quantify matching
All expressions introduced before can match character only one time. If a expression is
followed by a quantifier, it can matches more than one times.
For example: we can use the pattern "[bcd]{2}" instead of "[bcd][bcd]".
Expression Function
Match exactly n times, example: "\w{2}" equals "\w\w"; "a{5}"
{n}
equals "aaaaa"
{m,n} At least m but no more than n times: "ba{1,3}" matches "ba","baa","baaa"
{m,} Match at least n times: "\w\d{2,}" matches "a12","_456","M12344"...
? Match 1 or 0 times, equivalent to {0,1}: "a[cd]?" matches "a","ac","ad".
+ Match 1 or more times, equivalent to {1,}: "a+b" matches "ab","aab","aaab"...
* Match 0 or more times, equivalent to {0,}: "\^*b" matches "b","^^^b"...
- Example1: When pattern "\d+\.?\d*" matches "It costs $12.5", match result: success;
substring matched:"12.5"; position: starts at 10, ends at 14.
- Example2: When pattern "go{2,8}gle" matches "Ads by goooooogle", match result:
success; substring matched: "goooooogle"; position: starts at 7, ends at 17.
1.6 Some special punctuations with abstract function
Some punctuations in pattern have special function:
Expression Function
^ Match the beginning of the string
$ Match the end of the string
\b Match a word boundary
More examples to help you to understand.
- Example1: When pattern "^aaa" matches "xxx aaa xxx", match result: failed. Because "^"
must match the beginning of the string. It could match successfully on condition that "aaa" is
at the beginning of the string, such as "aaa xxx xxx".
- Example2: When pattern "aaa$" matches "xxx aaa xxx", match result: failed. Bacause "$"
must match the end of the string. It could match successfully on condition that "aaa" is at the
end of the string, such as "xxx xxx aaa".
- Example3: When pattern ".\b." matches "@@@abc", match result: success; substring
matched: "@a"; position: starts at 2, ends at 4.
Further explanation: "\b" is similar to "^" and "$", matches no character itself, but it require a
'\w' character at its one side, another not '\w' character at the other side.
- Example4: When pattern "\bend\b" matches "weekend,endfor,end", match result:
success; substring matched: "end"; position: starts at 15, ends at 18.
Some special punctuation can make effect on other sub-patterns:
Expression Function
| Alternation, matches either left side or right side
(1). Let sub-patterns in it to be a whole part when it is quantified.
( )
(2). Match result of sub-patterns in it can be retrieved individually
- Example5: When pattern "Tom|Jack" matches string "I'm Tom, he is Jack", match result:
success; substring matched: "Tom"; position: starts at 4, ends at 7. When match next, match
result: success; substring matched: "Jack"; position: starts at 15, ends at 19.
- Example6: When pattern "(go\s*)+" matches "Let's go go go!", match result: success;
substring matched: "go go go"; position: starts at 6, ends at 14.
- Example7: When pattern "?(\d+\.?\d*)" matches "$10.9,?20.5", match result: success;
substring matched: "?20.5"; position: starts at 6, ends at 10. Match result of sub-patterns
in "( )" is: "20.5".
2. Regular expression advanced syntax
2.1 Reluctant or greedy quantifiers
There are serval method to quantify subpattern, such as: "{m,n}", "{m,}", "?", "*", "+". By
default, a quantified subpattern is "greedy", that is, it will match as many times as possible
(given a particular starting location) while still allowing the rest of the pattern to match. For
example, to match "dxxxdxxxd":
Expression Match result
(d)(\w+) "\w+" matches all characters "xxxdxxxd" behind of "d"
"\w+" matches all characters "xxxdxxx" between the first "d" and the last
(d)(\w+)(d) "d". In order to let the whole pattern match success, "\w" has to give up the
last "d", although it can match the last "d" too.
Thus it can be seen that: when "\w+" matches, it will match as many characters as possible.
In the second example, it does not match the last "d", but this is in order to let the whole
pattern match successfully. Pattern with "*" or "{m,n}" will also match as many times as
possible, pattern with "?" will match if possible. This type of matching is called "greedy
matching". ?
Reluctant Matching:
To follow the quantifier with a "?", it can let the pattern to match the minimum number of
times possible. This type of matching is called reluctant matching. In order to let the whole
pattern match successfully, the reluctant pattern may match a few more times if it is required.
For example, to match "dxxxdxxxd":
Expression Match result
(d)(\w+?) "\w+?" match as few times as possible, so "\w+?" matches only one "x"
In order to let the whole pattern match successfully, "\w+?" has to match
(d)(\w+?)(d)
"xxx". So, match result is: "\w+?" matches "xxx"
More examples:
- Example1: When pattern "<td>(.*)</td>" matches "<td><p>aa</p></td>
<td><p>bb</p></td>", match result: success; substring matched: the whole
"<td><p>aa</p></td> <td><p>bb</p></td>", "</td>" in the pattern matches the last
"</td>" in the string.
- Example2: For comparison, when pattern "<td>(.*?)</td>" matches the string in
example1, it matches "<td><p>aa</p></td>". When match next, the next "<td><p>bb</p>
</td>" can be matched.
2.2 Referring to matched substring \1, \2...
During the process of matching, the match results of subpattern between parentheses "( )"
are recorded for later use. When retrieving match results, those match result of subpattern can
be retrieved individually, and this has been demonstrated many times in former examples. In
practice, parentheses "( )" must be used to get what we want indeed after match, such as
"<td>(.*?)</td>".
In fact, those match result of subpattern between parentheses can be used not only after
matching, but also during matching. The latter part of subpattern, can refer the match result of
former subpattern. Usage: "\" plus a number to refer to the corresponding substring. "\1" refers
to 1st pair of parentheses' match result, "\2" refers to 2nd pair of parentheses' match result.
Examples:
- Example1: When pattern "('|")(.*?)(\1)" matches " 'Hello', "World" ", match result: success;
substring matched: " 'Hello' "; when match next, substring matched: " "World" ".
- Example2: When pattern "(\w)\1{4,}" matches "aa bbbb abcdefg ccccc 111121111
999999999", match result: success; substring matched: "ccccc"; when match next, substring
matched "999999999". This pattern require a character of "\w" to repeat at least 5 times.
Pay attention to comparison with "\w{5,}".
- Example3: When pattern "<(\w+)\s*(\w+(=('|").*?\4)?\s*)*>.*?</\1>" matches "<td
id='td1' style="bgcolor:white"></td>", match result: success. If both "<td>" and "</td>" are
not "td", the match will fail.
2.2b DEELX Regular Expression Replace Syntax
$1 ~ $999 - Stands for what a certain group captured. If the number is larger than the max group number,
DEELX will use less digitals, till the number is smaller than or equal to the max group number.
For example:
If the max group number is 20, "$999" means "$9" and common string "99", while "$15" means the 15th group.
If you need "$1" and common string "5", you can use $0015 , DEELX at most recognize 3 digitals as number.
${name} - Stands for what a named group captured.
$$ - Stands for a single dollars ($).
$& - Stands for what the overall expression captured.
$` - The substring before the beginning of what the overall expression captured in the original text.
$' - The substring behind the end of what the overall expression captured in the original text.
$+ - Stands for what a group captured, which group has the max group number among those groups
that have captured. For example: when "aaa(b+)|ccc(b+)" matches "aaabbb" ,
$+ stands for $1, even though $2 has the max group number.
$_ - Stands for the whole original text.
2.3 Lookahead assertion; Lookbehind assertion
In former chapters, I have introduced serval punctuations with special function:
"^","$","\b". They all do not match any characters, but they all require certain conditions on
their position. Now, this chapter will introduce more methods to add conditions on the gap
between characters.
Lookahead assertion: "(?=xxxxx)", "(?!xxxxx)"
Format: "(?=xxxxx)", the condition which it add on the gap is that: string on the right side of
the gap must be abe to match the subpattern "xxxxx" between the parentheses. It is just a
condition, not a match operation, so there is no match result.
- Example1: When pattern "Windows (?=NT|XP)" matches "Windows 98, Windows NT,
Windows 2000", it can match only "Windows " of "Windows NT", the other "Windows " could
not be matched.
- Example2: When pattern "(\w)((?=\1\1\1)(\1))+" matches "aaa ffffff 999999999", it can
match first 4 "f"s among the 6 "f"s, it can match first 7 "9"s among 9 "9"s.
Format: "(?!xxxxx)", string on the right side of the gap must not be able to match the
subpattern "xxxxx".
- Example3: When pattern "((?!\bstop\b).)+" matches "fdjka ljfdl stop fjdsla fdj", it will
match from the beginning of string to the position of "stop". If there is no "stop" in the string,
the pattern will match the whole string.
- Example4: When pattern "do(?!\w)" matches "done, do, dog", it can only match "do".
Here, "(?!\w)" has the same effect as "\b".
Lookbehind assertion: "(?<=xxxxx)", "(?<!xxxxx)"
The concepts of "Lookbehind assertion" and "Lookahead assertion" are similar. "(?<=xxxxx)"
and "(?<!xxxxx)" require the string on the left side of the gap to be able to match or to be not
able to match the subpattern, not the right side. And they will not match any characters
themselves too.
Example5: When pattern "(?<=\d{4})\d+(?=\d{4})" matches "1234567890123456", it will
match 8 numbers in the middle, except first 4 numbers and last 4 numbers. Because
lookbehind assertion is not supported by JScript.RegExp, this example could not be
demonstrated. There are many engines support lookbehind assertion, such as java.util.regex
package in Java 1.4 or later, System.Text.RegularExpressions namespace in .NET platform, and
DEELX Regexp Engine etc.
3. Other usually supported rules
There are several usually supported rules which have not been mentioned.
3.1 In pattern, a character can be expressed as "\xXX" or "\uXXXX" ("X" is a hex number)
Format Character range
\xXX 0 ~ 255, such as space can be "\x20"
\uXXXX Any character can be expressed as "\u" plus 4 hex numbers, such as "\u4E2D"
3.2 While "\s", "\d", "\w", "\b" are specially defined, their uppercase letters have the opposite
meaning
Pattern Matches
\S All characters except spaces
\D All characters except numeric characters
\W All characters except alpha, numeric, "_"
\B Characters' gap which is not a word boundary
3.3 Specially defined characters table
Character Description
^ Matches the beginning of the string. Use "\^" to match "^" itself
$ Matches the end of the string. Use "\$" to match "$" itself
( ) Grouping. Use "\(" and "\)" to match "(" and ")"
[ ] Character class. Use "\[" and "\]" to match "[" and "]"
{ } Define quantifiers. Use "\{" and "\}" to match "{" and "}"
. Match any character except newline(\n). Use "\." to match "." itself
? Let subpattern match 0 or 1 time. Use "\?" to match "?" itself
+ Let subpattern match at least 1 times. Use "\+" to match "+" itself
* Let subpattern match any times. Use "\*" to match "*" itself
| Alternation. Use "\|" to match "|" itself
3.4 If a subpattern is in "(?:xxxxx)", the match result is not recorded for later use.
- Example1: When pattern "(?:(\w)\1)+" matches "a bbccdd efg", the substring matched:
"bbccdd". The match result of subpattern in "(?:)" is not recorded, so "\1" is used to refer to the
match result of "(\w)".
3.5 Pattern attribute: Ignorecase,Singleline,Multiline,Global
Attribute Description
Ignorecase Do case-insensitive pattern matching. Default is case-sensitive.
Treat string as single line. That is, change "." to match any character
Singleline
whatsoever, even a newline, which it normally would not match.
Treat string as multiple lines. Default is that "^" and "$" match at only the very
start? of the string and end? of the string. If multiline, they match the
start? of any line and end? of any line within the string:
Multiline
?xxxxxxxxx?\n
?xxxxxxxxx?
Global Replace all matches if the pattern is used in replace operation.
4. Integrated prompt
4.1 If you want to know what else are implemented by advanced engines, you can refer to
DEELX Syntax on this site.
4.2 If the pattern is required to match the whole string, not a part of string, we may use "^" and
"$", such as: "^\d+$" require the whole string consist of digit characters.
4.3 If the pattern is required to match a whole word, not a part of word, we may use "\b" at the
beginning and the end of the pattern, such as: use "\b(if|while|else|void|int……)\b" to match
keywords in a program.
4.4 Do not let pattern match empty string "". Or you will get an empty substring matched,
while the match operation returns success. For example: if we need a pattern to match
"123"?"123."?"123.5"?".5", we should not use this pattern "\d*\.?\d*". Though there is
nothing, we may still get a success. Proper pattern: "\d+\.?\d*|\.\d+".
4.5 Do not let a subpattern loop infinite times if the subpattern can match empty string.
4.6 Choose reluctant or greedy quantifier properly.
4.7 Only one side of "|" to match a certain character.
Author: sswater shi.
RegExLab.com © 2005 - 2016 All Rights Reserved

View File

@ -0,0 +1,2 @@
[InternetShortcut]
URL=http://www.regexlab.com/en/regref.htm

Binary file not shown.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1356,7 +1356,7 @@ END
STRINGTABLE
BEGIN
IDS_BACKSLASHHELP "Backslash Transformations\n\n\\a\tAlert (BEL, Ascii 7)\n\\b\tBackspace (BS, Ascii 8)\n\\f\tFormfeed (FF, Ascii 12)\n\\n\tNewline (LF, Ascii 10)\n\\r\tCarriage return (CR, Ascii 13)\n\\t\tHorizontal Tab (HT, Ascii 9)\n\\v\tVertical Tab (VT, Ascii 11)\n\\ooo\tOctal Value\n\\u####\tHexadecimal Value\n\\xhh\tHexadecimal Value\n\\\\\tBackslash"
IDS_REGEXPHELP "RegExp Syntax (Single Lines Only)\n\n.\tAny character\n^\tStart of a line\n$\tEnd of a line\n\\<\tStart of a word\n\\>\tEnd of a word\n[...]\tA set of chars ([abc]) or a range ([a-z])\n[^...]\tChars NOT in the set or range\n\\d\tAny decimal digit\n\\D\tAny non-digit char\n\\s\tAny whitespace char\n\\S\tNot a whitespace char\n\\w\tAny ""word"" char\n\\W\tAny ""non-word"" char\n\\x\tEscape character with otherwise special meaning\n\\xHH\tChar with hex code HH\n?\tMatches preceding 0 or 1 times\n*\tMatches preceding 0 or more times\n+\tMatches preceding 1 or more times\n*? or +?\tNon greedy matching of quantifiers ""?"" and ""+""\n(\tStart of a region\n)\tEnd of a region\n\\n\tRefers to a region when replacing (n is 1-9)\n"
IDS_REGEXPHELP "RegExp Syntax (Multi Lines)\n\n.\tAny character, except line-breaks\n^\tStart of a line\n$\tEnd of a line\n\\<\tStart of a word\n\\>\tEnd of a word\n[...]\tA set of chars ([abc]) or a range ([a-z])\n[^...]\tChars NOT in the set or range\n\\d\tAny decimal digit\n\\D\tAny non-digit char\n\\s\tAny whitespace char\n\\S\tNot a whitespace char\n\\w\tAny ""word"" char\n\\W\tAny ""non-word"" char\n\\x\tEscape character with otherwise special meaning\n\\xHH\tChar with hex code HH\n?\tMatches preceding 0 or 1 times\n*\tMatches preceding 0 or more times\n+\tMatches preceding 1 or more times\n*? or +?\tNon greedy matching of quantifiers ""?"" and ""+""\n(\tStart of a region\n)\tEnd of a region\n\\n\tRefers to a region when replacing (n is 1-9)\n"
IDS_WILDCARDHELP "Wildcard Search\n\n*\tMatches zero or more characters.\n?\tMatches exactly one character. "
END