mirror of
https://github.com/rizonesoft/Notepad3.git
synced 2026-06-14 21:09:05 +08:00
+ Integration of DeelX RegExpr engine ( see http://www.regexlab.com/en/deelx/ )
Notepad2 and its maintained fork notepad2-mod both use Scintilla's internal regexpr engine, which has its limitations ( see XhmikosR/notepad2-mod#148 ). With wise foresight, the developer of Scintilla provided an interface (activated by the preprocessor define SCI_OWNREGEX) for embedding your own RegExpr search (and replace) engine.
This commit is contained in:
parent
3dcab849aa
commit
1d526576c9
@ -121,7 +121,7 @@
|
||||
<MultiProcessorCompilation>true</MultiProcessorCompilation>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
||||
<PreprocessorDefinitions>_SCL_SECURE_NO_WARNINGS;WIN32;_DEBUG;_WINDOWS;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;STATIC_BUILD;SCI_LEXER;USE_D2D;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>_SCL_SECURE_NO_WARNINGS;WIN32;SCI_OWNREGEX;_DEBUG;_WINDOWS;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;STATIC_BUILD;SCI_LEXER;USE_D2D;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
</ClCompile>
|
||||
@ -134,7 +134,7 @@
|
||||
<MultiProcessorCompilation>true</MultiProcessorCompilation>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
||||
<PreprocessorDefinitions>_SCL_SECURE_NO_WARNINGS;_WIN64;_DEBUG;_WINDOWS;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;STATIC_BUILD;SCI_LEXER;USE_D2D;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>_SCL_SECURE_NO_WARNINGS;_WIN64;SCI_OWNREGEX;_DEBUG;_WINDOWS;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;STATIC_BUILD;SCI_LEXER;USE_D2D;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
</ClCompile>
|
||||
@ -150,7 +150,7 @@
|
||||
<MultiProcessorCompilation>true</MultiProcessorCompilation>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
||||
<PreprocessorDefinitions>_SCL_SECURE_NO_WARNINGS;WIN32;NDEBUG;_WINDOWS;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;STATIC_BUILD;SCI_LEXER;USE_D2D;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>_SCL_SECURE_NO_WARNINGS;WIN32;SCI_OWNREGEX;NDEBUG;_WINDOWS;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;STATIC_BUILD;SCI_LEXER;USE_D2D;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
</ClCompile>
|
||||
@ -162,7 +162,7 @@
|
||||
<MultiProcessorCompilation>true</MultiProcessorCompilation>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
||||
<PreprocessorDefinitions>_SCL_SECURE_NO_WARNINGS;_WIN64;NDEBUG;_WINDOWS;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;STATIC_BUILD;SCI_LEXER;USE_D2D;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>_SCL_SECURE_NO_WARNINGS;_WIN64;SCI_OWNREGEX;NDEBUG;_WINDOWS;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;STATIC_BUILD;SCI_LEXER;USE_D2D;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
</ClCompile>
|
||||
@ -171,6 +171,7 @@
|
||||
</Lib>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="deelx\DeelxRegexSearch.cxx" />
|
||||
<ClCompile Include="lexers\LexAHK.cxx" />
|
||||
<ClCompile Include="lexers\LexAsm.cxx" />
|
||||
<ClCompile Include="lexers\LexAU3.cxx" />
|
||||
@ -249,6 +250,7 @@
|
||||
<ClCompile Include="win32\ScintillaWin.cxx" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="deelx\deelx64.h" />
|
||||
<ClInclude Include="include\ILexer.h" />
|
||||
<ClInclude Include="include\Platform.h" />
|
||||
<ClInclude Include="include\SciLexer.h" />
|
||||
@ -304,6 +306,9 @@
|
||||
<ClInclude Include="src\XPM.h" />
|
||||
<ClInclude Include="win32\PlatWin.h" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<None Include="deelx\doc\deelx_en.chm" />
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
|
||||
@ -16,6 +16,12 @@
|
||||
<Filter Include="win32">
|
||||
<UniqueIdentifier>{afe7e35e-cd81-406c-a770-df29d2b3fc95}</UniqueIdentifier>
|
||||
</Filter>
|
||||
<Filter Include="deelx">
|
||||
<UniqueIdentifier>{67242aad-9133-44e7-9774-c36f5a9194bc}</UniqueIdentifier>
|
||||
</Filter>
|
||||
<Filter Include="deelx\doc">
|
||||
<UniqueIdentifier>{4e167b73-0447-4a31-a66b-64c2d684516d}</UniqueIdentifier>
|
||||
</Filter>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="lexers\LexAHK.cxx">
|
||||
@ -237,6 +243,18 @@
|
||||
<ClCompile Include="win32\ScintillaWin.cxx">
|
||||
<Filter>win32</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="deelx\DeelxRegexSearch.cxx">
|
||||
<Filter>deelx</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="lexers\LexJSON.cxx">
|
||||
<Filter>lexers</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="lexers\LexMatlab.cxx">
|
||||
<Filter>lexers</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="lexers\LexRegistry.cxx">
|
||||
<Filter>lexers</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="include\ILexer.h">
|
||||
@ -398,6 +416,16 @@
|
||||
<ClInclude Include="win32\PlatWin.h">
|
||||
<Filter>win32</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="include\Sci_Position.h" />
|
||||
<ClInclude Include="deelx\deelx64.h">
|
||||
<Filter>deelx</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="include\Sci_Position.h">
|
||||
<Filter>include</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<None Include="deelx\doc\deelx_en.chm">
|
||||
<Filter>deelx\doc</Filter>
|
||||
</None>
|
||||
</ItemGroup>
|
||||
</Project>
|
||||
330
scintilla/deelx/DeelxRegexSearch.cxx
Normal file
330
scintilla/deelx/DeelxRegexSearch.cxx
Normal file
@ -0,0 +1,330 @@
|
||||
/**
|
||||
* @file DeelxRegexSearch.cxx
|
||||
* @brief integrate DeelX regex searching for Scintilla library
|
||||
* (Scintilla Lib is copyright 1998-2016 by Neil Hodgson <neilh@scintilla.org>)
|
||||
*
|
||||
* uses DEELX - Regular Expression Engine (v1.3) (deelx.h) - http://www.regexlab.com/deelx/
|
||||
* download: http://www.regexlab.com/download/deelx/deelx.zip (v1.2)
|
||||
* or : https://github.com/AndreasMartin72/mksqlite/blob/master/deelx/deelx.h (v1.3)
|
||||
* (Copyright Announcement: Free to use/redistribute. Provenance must be declared when redistributed)
|
||||
* API documentation see accompanying "deelx_en.chm" HTML Help.
|
||||
*
|
||||
* @author Rainer Kottenhoff (RaPeHoff)
|
||||
*
|
||||
* Install:
|
||||
* - place files (deelx64.h, DeelxRegexSearch.cxx, deelx_en.chm)
|
||||
* in a directory (deelx) within the scintilla project (.../scintilla/deelx/)
|
||||
* - add source files to scintilla project (Scintilla.vcxproj in VS)
|
||||
* - define compiler (preprocessor) macro for scintilla project named "SCI_OWNREGEX"
|
||||
* -> this will switch from scintilla's built-in regex engine to deelx's regex engine
|
||||
* - recompile and link scintilla library
|
||||
* - build application
|
||||
*/
|
||||
|
||||
#ifdef SCI_OWNREGEX
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#pragma warning( push )
|
||||
#pragma warning( disable : 4996 ) // Scintilla's "unsafe" use of std::copy() (SplitVector.h)
|
||||
// // or use -D_SCL_SECURE_NO_WARNINGS preprocessor define
|
||||
|
||||
#include "Platform.h"
|
||||
#include "Scintilla.h"
|
||||
#include "ILexer.h"
|
||||
#include "SplitVector.h"
|
||||
#include "Partitioning.h"
|
||||
#include "CellBuffer.h"
|
||||
#include "CaseFolder.h"
|
||||
#include "RunStyles.h"
|
||||
#include "Decoration.h"
|
||||
#include "CharClassify.h"
|
||||
#include "Document.h"
|
||||
// ---------------------------------------------------------------
|
||||
#include "deelx64.h" // DEELX - Regular Expression Engine (v1.3)
|
||||
// ---------------------------------------------------------------
|
||||
|
||||
#ifdef SCI_NAMESPACE
|
||||
using namespace Scintilla;
|
||||
#endif
|
||||
|
||||
class DeelxRegexSearch : public RegexSearchBase
|
||||
{
|
||||
public:
|
||||
|
||||
explicit DeelxRegexSearch(CharClassify* charClassTable)
|
||||
: m_RegExpr()
|
||||
, m_Match()
|
||||
, m_MatchPos(-1)
|
||||
, m_MatchLength(0)
|
||||
, m_pContext(nullptr)
|
||||
, m_SubstitutionBuffer(nullptr)
|
||||
{}
|
||||
|
||||
virtual ~DeelxRegexSearch()
|
||||
{
|
||||
ReleaseSubstitutionBuffer();
|
||||
ReleaseContext();
|
||||
}
|
||||
|
||||
virtual long FindText(Document* doc, int minPos, int maxPos, const char* pattern,
|
||||
bool caseSensitive, bool word, bool wordStart, int flags, int* length) override;
|
||||
|
||||
virtual const char* SubstituteByPosition(Document* doc, const char* text, int* length) override;
|
||||
|
||||
|
||||
private:
|
||||
|
||||
inline void ReleaseContext()
|
||||
{
|
||||
if (m_pContext != nullptr) {
|
||||
m_RegExpr.ReleaseContext(m_pContext);
|
||||
m_pContext = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
inline void ReleaseSubstitutionBuffer()
|
||||
{
|
||||
if (m_SubstitutionBuffer) {
|
||||
m_RegExpr.ReleaseString(m_SubstitutionBuffer);
|
||||
m_SubstitutionBuffer = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
deelx::CRegexpT<char> m_RegExpr;
|
||||
deelx::MatchResult m_Match;
|
||||
deelx::index_t m_MatchPos;
|
||||
deelx::index_t m_MatchLength;
|
||||
deelx::CContext* m_pContext;
|
||||
char* m_SubstitutionBuffer;
|
||||
};
|
||||
// ============================================================================
|
||||
|
||||
|
||||
#ifdef SCI_NAMESPACE
|
||||
RegexSearchBase *Scintilla::CreateRegexSearch(CharClassify *charClassTable)
|
||||
{
|
||||
return new DeelxRegexSearch(charClassTable);
|
||||
}
|
||||
#else
|
||||
RegexSearchBase *CreateRegexSearch(CharClassify *charClassTable)
|
||||
{
|
||||
return new DeelxRegexSearch(charClassTable);
|
||||
}
|
||||
#endif
|
||||
|
||||
// ============================================================================
|
||||
|
||||
/**
|
||||
* forward declaration of utility functions
|
||||
*/
|
||||
std::string& translateRegExpr(std::string& regExprStr, bool wholeWord, bool wordStart);
|
||||
std::string& convertReplExpr(std::string& replStr);
|
||||
|
||||
|
||||
// ============================================================================
|
||||
|
||||
|
||||
/**
|
||||
* Find text in document, supporting both forward and backward
|
||||
* searches (just pass minPos > maxPos to do a backward search)
|
||||
* Has not been tested with backwards DBCS searches yet.
|
||||
*/
|
||||
long DeelxRegexSearch::FindText(Document* doc, int minPos, int maxPos, const char *pattern,
|
||||
bool caseSensitive, bool word, bool wordStart, int searchFlags, int *length)
|
||||
{
|
||||
int startPos, endPos;
|
||||
bool left2right;
|
||||
|
||||
if (minPos <= maxPos) {
|
||||
left2right = true;
|
||||
startPos = minPos;
|
||||
endPos = maxPos;
|
||||
}
|
||||
else { // backward search
|
||||
left2right = false;
|
||||
startPos = maxPos;
|
||||
endPos = minPos;
|
||||
}
|
||||
|
||||
// Range endpoints should not be inside DBCS characters, but just in case, move them.
|
||||
startPos = doc->MovePositionOutsideChar(startPos, 1, false);
|
||||
endPos = doc->MovePositionOutsideChar(endPos, 1, false);
|
||||
|
||||
int compileFlags(deelx::MULTILINE | deelx::GLOBAL | deelx::EXTENDED); // the .(dot) does not match line-breaks
|
||||
//int compileFlags(deelx::SINGLELINE | deelx::MULTILINE | deelx::GLOBAL | deelx::EXTENDED); // the .(dot) also matches line-breaks
|
||||
compileFlags |= (caseSensitive) ? deelx::NO_FLAG : deelx::IGNORECASE;
|
||||
compileFlags |= (left2right) ? deelx::NO_FLAG : deelx::RIGHTTOLEFT;
|
||||
|
||||
std::string sRegExprStrg = translateRegExpr(std::string(pattern, *length), word, wordStart);
|
||||
|
||||
try {
|
||||
m_RegExpr.Compile(sRegExprStrg.c_str(), compileFlags);
|
||||
}
|
||||
catch (...) {
|
||||
return -2; // -1 is normally used for not found, -2 is used here for invalid regex
|
||||
}
|
||||
|
||||
int rangeLen = endPos - startPos;
|
||||
int searchStartPos = left2right ? 0 : rangeLen;
|
||||
ReleaseContext();
|
||||
m_pContext = m_RegExpr.PrepareMatch(doc->RangePointer(startPos, rangeLen), searchStartPos);
|
||||
|
||||
m_Match = m_RegExpr.Match(m_pContext);
|
||||
|
||||
m_MatchPos = -1; // not found
|
||||
m_MatchLength = 0;
|
||||
if (m_Match.IsMatched()) {
|
||||
m_MatchPos = startPos + m_Match.GetStart();
|
||||
m_MatchLength = (m_Match.GetEnd() - m_Match.GetStart());
|
||||
}
|
||||
|
||||
//NOTE: potential 64-bit-size issue at interface here:
|
||||
*length = static_cast<int>(m_MatchLength);
|
||||
return static_cast<long>(m_MatchPos);
|
||||
}
|
||||
// ============================================================================
|
||||
|
||||
|
||||
/**
 * Build the replacement text for the match found by the last FindText() call.
 *
 * @param doc     document the match was found in
 * @param text    replacement expression (backslash sequences are converted
 *                by convertReplExpr() before being handed to the engine)
 * @param length  in: length of text; out: length of the returned string
 *                (0 if there is no current match)
 * @return pointer to the substituted text, owned by this object and valid
 *         until the next call or destruction; nullptr if there is no current match
 */
const char* DeelxRegexSearch::SubstituteByPosition(Document* doc, const char* text, int* length)
{
  if (!m_Match.IsMatched() || (m_MatchPos < 0)) {
    *length = 0;
    return nullptr;
  }

  // Use a named string: convertReplExpr() takes a non-const reference and
  // binding a temporary to it is not valid standard C++.
  std::string sReplStrg(text, *length);
  convertReplExpr(sReplStrg);

  //NOTE: potential 64-bit-size issue at interface here:
  const char* pString = doc->RangePointer(static_cast<int>(m_MatchPos), static_cast<int>(m_MatchLength));

  deelx::index_t resLength;
  ReleaseSubstitutionBuffer(); // drop the result of the previous substitution
  m_SubstitutionBuffer = m_RegExpr.Replace(pString, m_MatchLength, sReplStrg.c_str(),
                                           static_cast<deelx::index_t>(sReplStrg.length()), resLength);

  //NOTE: potential 64-bit-size issue at interface here:
  *length = static_cast<int>(resLength);

  return m_SubstitutionBuffer;
}
|
||||
// ============================================================================
|
||||
|
||||
|
||||
|
||||
|
||||
// ============================================================================
|
||||
// Some Helpers
|
||||
// ============================================================================
|
||||
|
||||
|
||||
/**
 * Replace every occurrence of `from` in `source` by `to` (in place).
 *
 * Occurrences are located left to right and do not overlap; text inserted
 * by a replacement is not rescanned. An empty `from` leaves `source`
 * unchanged (searching for an empty string would otherwise never advance).
 */
void replaceAll(std::string& source, const std::string& from, const std::string& to)
{
  if (from.empty()) {
    return; // nothing sensible to replace; also avoids an endless loop below
  }

  std::string newString;
  newString.reserve(source.length() * 2);  // avoids a few memory allocations

  std::string::size_type lastPos = 0;
  std::string::size_type findPos;

  while (std::string::npos != (findPos = source.find(from, lastPos))) {
    newString.append(source, lastPos, findPos - lastPos);
    newString += to;
    lastPos = findPos + from.length();
  }
  // Care for the rest after last occurrence
  newString.append(source, lastPos, std::string::npos);

  source.swap(newString);
}
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
/**
 * Adapt a regular expression to the "whole word" / "word start" search options.
 *
 * If neither option is set the expression is returned unchanged.
 * Otherwise a word boundary (\b) is put in front of the expression, a second
 * one is appended if wholeWord is set, and every unescaped dot is narrowed
 * to \w so that "any character" cannot step over a word boundary.
 * Escaped characters (a backslash and the character after it, e.g. "\.")
 * are copied verbatim.
 *
 * @param regExprStr  expression, modified in place
 * @param wholeWord   match must start and end at a word boundary
 * @param wordStart   match must start at a word boundary
 * @return reference to regExprStr
 */
std::string& translateRegExpr(std::string& regExprStr, bool wholeWord, bool wordStart)
{
  if (!wholeWord && !wordStart) {
    return regExprStr;
  }

  const std::string::size_type len = regExprStr.length();

  std::string tmpStr;
  tmpStr.reserve(len + 8);

  tmpStr.append("\\b");  // '\b' at the begin of regexpr

  for (std::string::size_type i = 0; i < len; ++i) {
    const char ch = regExprStr[i];
    if ((ch == '\\') && ((i + 1) < len)) {
      // escape sequence: keep both characters untouched
      tmpStr.push_back(ch);
      tmpStr.push_back(regExprStr[++i]);
    }
    else if (ch == '.') {
      tmpStr.append("\\w");
    }
    else {
      tmpStr.push_back(ch);
    }
  }

  if (wholeWord) {
    tmpStr.append("\\b");  // '\b' at the end of regexpr
  }

  regExprStr.swap(tmpStr);
  return regExprStr;
}
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
/**
 * Convert a replacement expression into the form handed to DeelX.
 *
 * A backslash introduces an escape; a doubled backslash in front of a
 * character is treated like a single one. The character after the
 * backslash(es) is handled as follows:
 *   - '1'..'9'            : becomes "$<n>" (group reference)
 *   - a b f n r t v       : becomes the corresponding control character
 *   - backslash           : becomes one backslash
 *   - anything else       : is copied as is
 * A backslash (single or doubled) at the very end of the string has nothing
 * to escape and is emitted as one literal backslash.
 *
 * @param replStr  replacement expression, modified in place
 * @return reference to replStr
 */
std::string& convertReplExpr(std::string& replStr)
{
  const std::string::size_type len = replStr.length();

  std::string tmpStr;
  tmpStr.reserve(len);

  for (std::string::size_type i = 0; i < len; ++i) {
    char ch = replStr[i];
    if (ch != '\\') {
      tmpStr.push_back(ch);
      continue;
    }

    if ((i + 1) >= len) {
      // lone backslash at end of string: keep it literally
      tmpStr.push_back('\\');
      break;
    }
    ch = replStr[++i]; // next char

    if (ch == '\\') {
      // skip 2nd backslash ("\\")
      if ((i + 1) >= len) {
        tmpStr.push_back('\\');
        break;
      }
      ch = replStr[++i];
    }

    if (ch >= '1' && ch <= '9') {
      // former behavior convenience:
      // change "\\<n>" to deelx's group reference ($<n>)
      tmpStr.push_back('$');
    }

    switch (ch) {
      // check for escape seq:
      case 'a':  tmpStr.push_back('\a'); break;
      case 'b':  tmpStr.push_back('\b'); break;
      case 'f':  tmpStr.push_back('\f'); break;
      case 'n':  tmpStr.push_back('\n'); break;
      case 'r':  tmpStr.push_back('\r'); break;
      case 't':  tmpStr.push_back('\t'); break;
      case 'v':  tmpStr.push_back('\v'); break;
      case '\\': tmpStr.push_back('\\'); break;
      default:
        // unknown ctrl seq
        tmpStr.push_back(ch);
        break;
    }
  } //for

  replStr.swap(tmpStr);
  return replStr;
}
|
||||
// ============================================================================
|
||||
|
||||
#pragma warning( pop )
|
||||
|
||||
#endif //SCI_OWNREGEX
|
||||
4830
scintilla/deelx/deelx64.h
Normal file
4830
scintilla/deelx/deelx64.h
Normal file
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,2 @@
|
||||
[InternetShortcut]
|
||||
URL=http://www.regexlab.com/en/deelx/introidx.htm
|
||||
@ -0,0 +1,426 @@
|
||||
Regular Expression Syntax Reference http://www.regexlab.com/en/regref.htm
|
||||
|
||||
[All rights reserved: http://www.regexlab.com/en/regref.htm]
|
||||
[Author: sswater shi (sswater@gmail.com)]
|
||||
|
||||
|
||||
Introduction
|
||||
|
||||
Regular expression is to express a characteristic in a string, and then to match another string
|
||||
with the characteristic. For example, pattern "ab+" means "one 'a' and at least one 'b' ", so "ab",
|
||||
"abb", "abbbbbbb" match the pattern.
|
||||
|
||||
Regular expression is used to : (1) test a string whether it matches a pattern, such as a email
|
||||
address. (2) to find a substring which matches certain pattern, from a whole text. (3) to do
|
||||
complex replacement in a text.
|
||||
|
||||
It is very simple to study regular expression syntax, and the few abstract concepts can be
|
||||
understood easily too. Many articles do not introduce its concepts from simple ones to
|
||||
abstract ones step by step, so some persons may feel it is difficult to study. On the other hand,
|
||||
each regular expression engine's document will describe its special function, but this part of
|
||||
special function is not what we should study first.
|
||||
|
||||
|
||||
1. Regular Expression Basic Syntax
|
||||
|
||||
1.1 Common Characters
|
||||
|
||||
Letters, numbers, the underline, and punctuations with no special definition are "common
|
||||
characters". When regular expression matches a string, a common character can match the
|
||||
same character.
|
||||
|
||||
- Example1: When pattern "c" matches string "abcde", match result: success; substring
|
||||
matched: "c"; position: starts at 2, ends at 3.
|
||||
|
||||
- Example2: When pattern "bcd" matches string "abcde",match result: success; substring
|
||||
matched: "bcd"; position: starts at 1, ends at 4.
|
||||
|
||||
|
||||
1.2 Simple escaped characters
|
||||
|
||||
Nonprinting characters which we know:
|
||||
|
||||
Expression Matches
|
||||
\r, \n Carriage return, newline character
|
||||
\t Tabs
|
||||
\\ Matches "\" itself
|
||||
|
||||
Some punctuations are specially defined in regular expression. To match these characters in
|
||||
string, add "\" in pattern. For example: ^, $ has special definition, so we need to use "\^" and
|
||||
"\$" to match them.
|
||||
|
||||
|
||||
Expression Matches
|
||||
\^ Matches "^" itself
|
||||
\$ Matches "$" itself
|
||||
|
||||
\. Matches dot(.) itself
|
||||
|
||||
These escaped characters have the same effect as "common characters": to match a certain
|
||||
character.
|
||||
|
||||
|
||||
- Example1: When pattern "\$d" matches string "abc$de", match result: success; substring
|
||||
matched: "$d"; position: starts at 3, ends at 5.
|
||||
|
||||
|
||||
1.3 Expression matches anyone of many characters
|
||||
|
||||
Some expressions can match anyone of many characters. For example: "\d" can match any
|
||||
number character. Each of these expressions can match only one character at one time, though
|
||||
they can match any character of a certain group of characters.
|
||||
|
||||
Expression Matches
|
||||
|
||||
\d Any digit character, any one of 0~9
|
||||
\w Any alpha, numeric, underline, any one of A~Z,a~z,0~9,_
|
||||
\s Any one of space, tab, newline, return, or newpage character
|
||||
|
||||
. '.' matches any character except the newline character(\n)
|
||||
|
||||
- Example1: When pattern "\d\d" matches "abc123", match result: success; substring
|
||||
matched: "12"; position: starts at 3, ends at 5.
|
||||
|
||||
- Example2: When pattern "a.\d" matches "aaa100", match result: success; substring
|
||||
matched: "aa1"; position: starts at 1, ends at 4.
|
||||
|
||||
|
||||
1.4 Custom expression matches anyone of many characters
|
||||
|
||||
Expression uses square brackets [ ] to contain a series of characters, it can match anyone of
|
||||
them. Uses [^ ] to contain a series of characters, it can match anyone character except
|
||||
characters contained.
|
||||
|
||||
Expression Matches
|
||||
|
||||
[ab5@] Matches "a" or "b" or "5" or "@"
|
||||
[^abc] Matches any character except "a","b","c"
|
||||
|
||||
[f-k] Any character among "f"~"k"
|
||||
[^A-F0-3] Any character except "A"~"F","0"~"3"
|
||||
|
||||
- Example1: When pattern "[bcd][bcd]" matches "abc123" , match result: success; substring
|
||||
matched: "bc"; position: starts at 1, ends at 3.
|
||||
|
||||
- Example2: When pattern "[^abc]" matches "abc123", match result: success; substring
|
||||
matched: "1"; position: starts at 3, ends at 4.
|
||||
|
||||
|
||||
1.5 Special expression to quantify matching
|
||||
|
||||
All expressions introduced before can match character only one time. If a expression is
|
||||
followed by a quantifier, it can matches more than one times.
|
||||
|
||||
|
||||
For example: we can use the pattern "[bcd]{2}" instead of "[bcd][bcd]".
|
||||
|
||||
Expression Function
|
||||
{n} Match exactly n times, example: "\w{2}" equals "\w\w"; "a{5}" equals "aaaaa"
|
||||
{m,n} At least m but no more than n times: "ba{1,3}" matches "ba","baa","baaa"
|
||||
|
||||
{m,} Match at least m times: "\w\d{2,}" matches "a12","_456","M12344"...
|
||||
? Match 1 or 0 times, equivalent to {0,1}: "a[cd]?" matches "a","ac","ad".
|
||||
|
||||
+ Match 1 or more times, equivalent to {1,}: "a+b" matches "ab","aab","aaab"...
|
||||
* Match 0 or more times, equivalent to {0,}: "\^*b" matches "b","^^^b"...
|
||||
|
||||
|
||||
- Example1: When pattern "\d+\.?\d*" matches "It costs $12.5", match result: success;
|
||||
substring matched:"12.5"; position: starts at 10, ends at 14.
|
||||
|
||||
|
||||
- Example2: When pattern "go{2,8}gle" matches "Ads by goooooogle", match result:
|
||||
success; substring matched: "goooooogle"; position: starts at 7, ends at 17.
|
||||
|
||||
|
||||
|
||||
1.6 Some special punctuations with abstract function
|
||||
|
||||
Some punctuations in pattern have special function:
|
||||
|
||||
Expression Function
|
||||
^ Match the beginning of the string
|
||||
$ Match the end of the string
|
||||
|
||||
\b Match a word boundary
|
||||
|
||||
More examples to help you to understand.
|
||||
|
||||
- Example1: When pattern "^aaa" matches "xxx aaa xxx", match result: failed. Because "^"
|
||||
must match the beginning of the string. It could match successfully on condition that "aaa" is
|
||||
at the beginning of the string, such as "aaa xxx xxx".
|
||||
|
||||
- Example2: When pattern "aaa$" matches "xxx aaa xxx", match result: failed. Because "$"
|
||||
must match the end of the string. It could match successfully on condition that "aaa" is at the
|
||||
end of the string, such as "xxx xxx aaa".
|
||||
|
||||
- Example3: When pattern ".\b." matches "@@@abc", match result: success; substring
|
||||
matched: "@a"; position: starts at 2, ends at 4.
|
||||
Further explanation: "\b" is similar to "^" and "$", matches no character itself, but it require a
|
||||
'\w' character at its one side, another not '\w' character at the other side.
|
||||
|
||||
|
||||
- Example4: When pattern "\bend\b" matches "weekend,endfor,end", match result:
|
||||
success; substring matched: "end"; position: starts at 15, ends at 18.
|
||||
|
||||
Some special punctuation can make effect on other sub-patterns:
|
||||
|
||||
|
||||
Expression Function
|
||||
| Alternation, matches either left side or right side
|
||||
( ) (1). Lets the sub-patterns inside be treated as a whole when quantified. (2). The match result of the sub-patterns inside can be retrieved individually
|
||||
|
||||
- Example5: When pattern "Tom|Jack" matches string "I'm Tom, he is Jack", match result:
|
||||
success; substring matched: "Tom"; position: starts at 4, ends at 7. When match next, match
|
||||
result: success; substring matched: "Jack"; position: starts at 15, ends at 19.
|
||||
|
||||
|
||||
- Example6: When pattern "(go\s*)+" matches "Let's go go go!", match result: success;
|
||||
substring matched: "go go go"; position: starts at 6, ends at 14.
|
||||
|
||||
- Example7: When pattern "¥(\d+\.?\d*)" matches "$10.9,¥20.5", match result: success;
|
||||
substring matched: "¥20.5"; position: starts at 6, ends at 10. Match result of sub-patterns
|
||||
in "( )" is: "20.5".
|
||||
|
||||
|
||||
2. Regular expression advanced syntax
|
||||
|
||||
2.1 Reluctant or greedy quantifiers
|
||||
|
||||
There are several methods to quantify a subpattern, such as: "{m,n}", "{m,}", "?", "*", "+". By
|
||||
default, a quantified subpattern is "greedy", that is, it will match as many times as possible
|
||||
(given a particular starting location) while still allowing the rest of the pattern to match. For
|
||||
example, to match "dxxxdxxxd":
|
||||
|
||||
Expression Match result
|
||||
|
||||
(d)(\w+) "\w+" matches all characters "xxxdxxxd" behind of "d"
|
||||
(d)(\w+)(d) "\w+" matches all characters "xxxdxxx" between the first "d" and the last "d". In order to let the whole pattern match successfully, "\w+" has to give up the last "d", although it could match the last "d" too.
|
||||
|
||||
Thus it can be seen that: when "\w+" matches, it will match as many characters as possible.
|
||||
In the second example, it does not match the last "d", but this is in order to let the whole
|
||||
pattern match successfully. Pattern with "*" or "{m,n}" will also match as many times as
|
||||
possible, pattern with "?" will match if possible. This type of matching is called "greedy
|
||||
matching".
|
||||
|
||||
|
||||
Reluctant Matching:
|
||||
|
||||
To follow the quantifier with a "?", it can let the pattern to match the minimum number of
|
||||
times possible. This type of matching is called reluctant matching. In order to let the whole
|
||||
pattern match successfully, the reluctant pattern may match a few more times if it is required.
|
||||
For example, to match "dxxxdxxxd":
|
||||
|
||||
Expression Match result
|
||||
|
||||
(d)(\w+?) "\w+?" match as few times as possible, so "\w+?" matches only one "x"
|
||||
(d)(\w+?)(d) In order to let the whole pattern match successfully, "\w+?" has to match "xxx". So, match result is: "\w+?" matches "xxx"
|
||||
|
||||
More examples:
|
||||
|
||||
- Example1: When pattern "<td>(.*)</td>" matches "<td><p>aa</p></td>
|
||||
<td><p>bb</p></td>", match result: success; substring matched: the whole
|
||||
"<td><p>aa</p></td> <td><p>bb</p></td>", "</td>" in the pattern matches the last
|
||||
"</td>" in the string.
|
||||
|
||||
- Example2: For comparison, when pattern "<td>(.*?)</td>" matches the string in
|
||||
example1, it matches "<td><p>aa</p></td>". When match next, the next "<td><p>bb</p>
|
||||
</td>" can be matched.
|
||||
|
||||
|
||||
2.2 Referring to matched substring \1, \2...
|
||||
|
||||
During the process of matching, the match results of subpattern between parentheses "( )"
|
||||
are recorded for later use. When retrieving match results, those match result of subpattern can
|
||||
be retrieved individually, and this has been demonstrated many times in former examples. In
|
||||
practice, parentheses "( )" must be used to get what we want indeed after match, such as
|
||||
"<td>(.*?)</td>".
|
||||
|
||||
|
||||
In fact, those match result of subpattern between parentheses can be used not only after
|
||||
matching, but also during matching. The latter part of subpattern, can refer the match result of
|
||||
former subpattern. Usage: "\" plus a number to refer to the corresponding substring. "\1" refers
|
||||
to 1st pair of parentheses' match result, "\2" refers to 2nd pair of parentheses' match result.
|
||||
|
||||
Examples:
|
||||
|
||||
- Example1: When pattern "('|")(.*?)(\1)" matches " 'Hello', "World" ", match result: success;
|
||||
substring matched: " 'Hello' "; when match next, substring matched: " "World" ".
|
||||
|
||||
- Example2: When pattern "(\w)\1{4,}" matches "aa bbbb abcdefg ccccc 111121111
|
||||
999999999", match result: success; substring matched: "ccccc"; when match next, substring
|
||||
matched "999999999". This pattern require a character of "\w" to repeat at least 5 times.
|
||||
Pay attention to comparison with "\w{5,}".
|
||||
|
||||
- Example3: When pattern "<(\w+)\s*(\w+(=('|").*?\4)?\s*)*>.*?</\1>" matches "<td
|
||||
id='td1' style="bgcolor:white"></td>", match result: success. If both "<td>" and "</td>" are
|
||||
not "td", the match will fail.
|
||||
|
||||
|
||||
2.2b DEELX Regular Expression Replace Syntax
|
||||
|
||||
$1 ~ $999 - Stands for what a certain group captured. If the number is larger than the max group number,
|
||||
DEELX will use fewer digits, until the number is smaller than or equal to the max group number.
|
||||
For example:
|
||||
If the max group number is 20, "$999" means "$9" and common string "99", while "$15" means the 15th group.
|
||||
If you need "$1" and common string "5", you can use $0015; DEELX recognizes at most 3 digits as a number.
|
||||
|
||||
${name} - Stands for what a named group captured.
|
||||
$$ - Stands for a single dollars ($).
|
||||
$& - Stands for what the overall expression captured.
|
||||
$` - The substring before the beginning of what the overall expression captured in the original text.
|
||||
$' - The substring behind the end of what the overall expression captured in the original text.
|
||||
$+ - Stands for what a group captured, which group has the max group number among those groups
|
||||
that have captured. For example: when "aaa(b+)|ccc(b+)" matches "aaabbb" ,
|
||||
$+ stands for $1, even though $2 has the max group number.
|
||||
$_ - Stands for the whole original text.
|
||||
|
||||
|
||||
2.3 Lookahead assertion; Lookbehind assertion
|
||||
In former chapters, I have introduced several punctuations with special function:
|
||||
"^","$","\b". They all do not match any characters, but they all require certain conditions on
|
||||
their position. Now, this chapter will introduce more methods to add conditions on the gap
|
||||
between characters.
|
||||
|
||||
Lookahead assertion: "(?=xxxxx)", "(?!xxxxx)"
|
||||
|
||||
Format: "(?=xxxxx)", the condition which it add on the gap is that: string on the right side of
|
||||
the gap must be able to match the subpattern "xxxxx" between the parentheses. It is just a
|
||||
condition, not a match operation, so there is no match result.
|
||||
|
||||
- Example1: When pattern "Windows (?=NT|XP)" matches "Windows 98, Windows NT,
|
||||
Windows 2000", it can match only "Windows " of "Windows NT", the other "Windows " could
|
||||
not be matched.
|
||||
|
||||
- Example2: When pattern "(\w)((?=\1\1\1)(\1))+" matches "aaa ffffff 999999999", it can
|
||||
match first 4 "f"s among the 6 "f"s, it can match first 7 "9"s among 9 "9"s.
|
||||
|
||||
Format: "(?!xxxxx)", string on the right side of the gap must not be able to match the
|
||||
subpattern "xxxxx".
|
||||
|
||||
- Example3: When pattern "((?!\bstop\b).)+" matches "fdjka ljfdl stop fjdsla fdj", it will
|
||||
match from the beginning of string to the position of "stop". If there is no "stop" in the string,
|
||||
the pattern will match the whole string.
|
||||
|
||||
|
||||
- Example4: When pattern "do(?!\w)" matches "done, do, dog", it can only match "do".
|
||||
Here, "(?!\w)" has the same effect as "\b".
|
||||
|
||||
Lookbehind assertion: "(?<=xxxxx)", "(?<!xxxxx)"
|
||||
|
||||
|
||||
The concepts of "Lookbehind assertion" and "Lookahead assertion" are similar. "(?<=xxxxx)"
|
||||
and "(?<!xxxxx)" require the string on the left side of the gap to be able to match or to be not
|
||||
able to match the subpattern, not the right side. And they will not match any characters
|
||||
themselves too.
|
||||
|
||||
Example5: When pattern "(?<=\d{4})\d+(?=\d{4})" matches "1234567890123456", it will
|
||||
match 8 numbers in the middle, except first 4 numbers and last 4 numbers. Because
|
||||
lookbehind assertion is not supported by JScript.RegExp, this example could not be
|
||||
demonstrated. There are many engines support lookbehind assertion, such as java.util.regex
|
||||
package in Java 1.4 or later, System.Text.RegularExpressions namespace in .NET platform, and
|
||||
DEELX Regexp Engine etc.
|
||||
|
||||
|
||||
3. Other usually supported rules
|
||||
|
||||
|
||||
There are several usually supported rules which have not been mentioned.
|
||||
|
||||
3.1 In pattern, a character can be expressed as "\xXX" or "\uXXXX" ("X" is a hex number)
|
||||
|
||||
|
||||
Format Character range
|
||||
\xXX 0 ~ 255, such as space can be "\x20"
|
||||
|
||||
\uXXXX Any character can be expressed as "\u" plus 4 hex numbers, such as "\u4E2D"
|
||||
|
||||
3.2 While "\s", "\d", "\w", "\b" are specially defined, their uppercase letters have the opposite
|
||||
meaning
|
||||
|
||||
|
||||
Pattern Matches
|
||||
\S All characters except spaces
|
||||
\D All characters except numeric characters
|
||||
|
||||
\W All characters except alpha, numeric, "_"
|
||||
|
||||
\B Characters' gap which is not a word boundary
|
||||
|
||||
3.3 Specially defined characters table
|
||||
|
||||
Character Description
|
||||
^ Matches the beginning of the string. Use "\^" to match "^" itself
|
||||
|
||||
$ Matches the end of the string. Use "\$" to match "$" itself
|
||||
( ) Grouping. Use "\(" and "\)" to match "(" and ")"
|
||||
[ ] Character class. Use "\[" and "\]" to match "[" and "]"
|
||||
|
||||
{ } Define quantifiers. Use "\{" and "\}" to match "{" and "}"
|
||||
. Match any character except newline(\n). Use "\." to match "." itself
|
||||
? Let subpattern match 0 or 1 time. Use "\?" to match "?" itself
|
||||
+ Let subpattern match at least 1 time. Use "\+" to match "+" itself
|
||||
* Let subpattern match any number of times. Use "\*" to match "*" itself
|
||||
|
||||
| Alternation. Use "\|" to match "|" itself
|
||||
|
||||
3.4 If a subpattern is in "(?:xxxxx)", the match result is not recorded for later use.
|
||||
|
||||
- Example1: When pattern "(?:(\w)\1)+" matches "a bbccdd efg", the substring matched:
|
||||
"bbccdd". The match result of subpattern in "(?:)" is not recorded, so "\1" is used to refer to the
|
||||
match result of "(\w)".
|
||||
|
||||
3.5 Pattern attribute: Ignorecase,Singleline,Multiline,Global
|
||||
|
||||
Attribute Description
|
||||
|
||||
Ignorecase Do case-insensitive pattern matching. Default is case-sensitive.
|
||||
Treat string as single line. That is, change "." to match any character
|
||||
Singleline
|
||||
whatsoever, even a newline, which it normally would not match.
|
||||
Treat string as multiple lines. Default is that "^" and "$" match at only the very
|
||||
start (1) of the string and end (4) of the string. If multiline, they match the
|
||||
start (1)(3) of any line and end (2)(4) of any line within the string:
|
||||
Multiline
|
||||
|
||||
(1)xxxxxxxxx(2)\n
|
||||
(3)xxxxxxxxx(4)
|
||||
|
||||
Global Replace all matches if the pattern is used in replace operation.
|
||||
|
||||
|
||||
|
||||
4. Integrated prompt
|
||||
|
||||
|
||||
4.1 If you want to know what else is implemented by advanced engines, you can refer to
|
||||
DEELX Syntax on this site.
|
||||
|
||||
4.2 If the pattern is required to match the whole string, not a part of string, we may use "^" and
|
||||
"$", such as: "^\d+$" requires the whole string to consist of digit characters.
|
||||
|
||||
4.3 If the pattern is required to match a whole word, not a part of word, we may use "\b" at the
|
||||
beginning and the end of the pattern, such as: use "\b(if|while|else|void|int……)\b" to match
|
||||
keywords in a program.
|
||||
|
||||
4.4 Do not let the pattern match the empty string "". Otherwise you will get an empty substring matched,
|
||||
while the match operation returns success. For example: if we need a pattern to match
|
||||
"123", "123.", "123.5", ".5", we should not use this pattern "\d*\.?\d*". Though there is
|
||||
nothing, we may still get a success. Proper pattern: "\d+\.?\d*|\.\d+".
|
||||
|
||||
4.5 Do not let a subpattern loop an infinite number of times if the subpattern can match an empty string.
|
||||
|
||||
4.6 Choose reluctant or greedy quantifier properly.
|
||||
4.7 Let only one side of "|" match a given character.
|
||||
|
||||
Author: sswater shi.
|
||||
|
||||
RegExLab.com © 2005 - 2016 All Rights Reserved
|
||||
@ -0,0 +1,2 @@
|
||||
[InternetShortcut]
|
||||
URL=http://www.regexlab.com/en/regref.htm
|
||||
BIN
scintilla/deelx/doc/deelx_en.chm
Normal file
BIN
scintilla/deelx/doc/deelx_en.chm
Normal file
Binary file not shown.
4665
scintilla/deelx/doc/orig_src/deelx12.h
Normal file
4665
scintilla/deelx/doc/orig_src/deelx12.h
Normal file
File diff suppressed because it is too large
Load Diff
4804
scintilla/deelx/doc/orig_src/deelx13.h
Normal file
4804
scintilla/deelx/doc/orig_src/deelx13.h
Normal file
File diff suppressed because it is too large
Load Diff
@ -1356,7 +1356,7 @@ END
|
||||
STRINGTABLE
|
||||
BEGIN
|
||||
IDS_BACKSLASHHELP "Backslash Transformations\n\n\\a\tAlert (BEL, Ascii 7)\n\\b\tBackspace (BS, Ascii 8)\n\\f\tFormfeed (FF, Ascii 12)\n\\n\tNewline (LF, Ascii 10)\n\\r\tCarriage return (CR, Ascii 13)\n\\t\tHorizontal Tab (HT, Ascii 9)\n\\v\tVertical Tab (VT, Ascii 11)\n\\ooo\tOctal Value\n\\u####\tHexadecimal Value\n\\xhh\tHexadecimal Value\n\\\\\tBackslash"
|
||||
IDS_REGEXPHELP "RegExp Syntax (Single Lines Only)\n\n.\tAny character\n^\tStart of a line\n$\tEnd of a line\n\\<\tStart of a word\n\\>\tEnd of a word\n[...]\tA set of chars ([abc]) or a range ([a-z])\n[^...]\tChars NOT in the set or range\n\\d\tAny decimal digit\n\\D\tAny non-digit char\n\\s\tAny whitespace char\n\\S\tNot a whitespace char\n\\w\tAny ""word"" char\n\\W\tAny ""non-word"" char\n\\x\tEscape character with otherwise special meaning\n\\xHH\tChar with hex code HH\n?\tMatches preceding 0 or 1 times\n*\tMatches preceding 0 or more times\n+\tMatches preceding 1 or more times\n*? or +?\tNon greedy matching of quantifiers ""?"" and ""+""\n(\tStart of a region\n)\tEnd of a region\n\\n\tRefers to a region when replacing (n is 1-9)\n"
|
||||
IDS_REGEXPHELP "RegExp Syntax (Multi Lines)\n\n.\tAny character, except line-breaks\n^\tStart of a line\n$\tEnd of a line\n\\<\tStart of a word\n\\>\tEnd of a word\n[...]\tA set of chars ([abc]) or a range ([a-z])\n[^...]\tChars NOT in the set or range\n\\d\tAny decimal digit\n\\D\tAny non-digit char\n\\s\tAny whitespace char\n\\S\tNot a whitespace char\n\\w\tAny ""word"" char\n\\W\tAny ""non-word"" char\n\\x\tEscape character with otherwise special meaning\n\\xHH\tChar with hex code HH\n?\tMatches preceding 0 or 1 times\n*\tMatches preceding 0 or more times\n+\tMatches preceding 1 or more times\n*? or +?\tNon greedy matching of quantifiers ""?"" and ""+""\n(\tStart of a region\n)\tEnd of a region\n\\n\tRefers to a region when replacing (n is 1-9)\n"
|
||||
IDS_WILDCARDHELP "Wildcard Search\n\n*\tMatches zero or more characters.\n?\tMatches exactly one character. "
|
||||
END
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user