[patch proposal] regex handling in XWiki core
Pablo Oliveira
pablo.oliveira at enst.fr
Tue Feb 13 09:54:05 CET 2007
Hi,
As some of you may know, I am currently working on a version of XWiki for
mobile devices.
I have been investigating the possibility of running some parts of XWiki
on a J2ME - CDC PP configuration.
During this process I have noticed that XWiki uses two different APIs for
matching regular expressions:
* Jakarta ORO
* java.util.regex (JDK >= 1.4)
Because J2ME does not have the java.util.regex classes, I have made
some small changes so that the core of XWiki only uses Jakarta ORO,
so that I can continue my tests.
Yet, I think these changes (see attached patch) may be of more general
interest because:
* it might be cleaner to stick to a single regex lib
* this patch factors some of the regex handling into an
encapsulating class that would allow us to change the
underlying regex implementation more easily.
What do you think?
Regards,
Pablo
-------------- next part --------------
Index: core/src/main/java/com/xpn/xwiki/render/XWikiMacrosMappingRenderer.java
===================================================================
--- core/src/main/java/com/xpn/xwiki/render/XWikiMacrosMappingRenderer.java (revision 2081)
+++ core/src/main/java/com/xpn/xwiki/render/XWikiMacrosMappingRenderer.java (working copy)
@@ -29,13 +29,13 @@
import com.xpn.xwiki.notify.DocChangeRule;
import com.xpn.xwiki.notify.XWikiDocChangeNotificationInterface;
import com.xpn.xwiki.notify.XWikiNotificationRule;
+import com.xpn.xwiki.util.RegexIterator;
+
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.HashMap;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
public class XWikiMacrosMappingRenderer implements XWikiRenderer, XWikiDocChangeNotificationInterface {
private static final Log log = LogFactory.getLog(XWikiMacrosMappingRenderer.class);
@@ -91,16 +91,16 @@
private String convertSingleLines(String content, XWikiContext context) {
StringBuffer result = new StringBuffer();
- String regexp = "\\{(\\w+)(:(.+))?\\}";
- Pattern p = Pattern.compile(regexp);
- Matcher m = p.matcher(content);
+ final String pattern = "\\{(\\w+)(:(.+))?\\}";
+ RegexIterator re = new RegexIterator(pattern, content);
+
int current = 0;
- while (m.find()) {
- result.append(content.substring(current, m.start()));
- current = m.end();
- String macroname = m.group(1);
- String params = m.group(3);
- String allcontent = m.group(0);
+ while (re.nextMatch()) {
+ result.append(content.substring(current, re.begin(0)));
+ current = re.end(0);
+ String macroname = re.group(1);
+ String params = re.group(3);
+ String allcontent = re.group(0);
XWikiVirtualMacro macro = (XWikiVirtualMacro) macros_mappings.get(macroname);
if ((macro!=null)&&(macro.isSingleLine()))
@@ -117,17 +117,17 @@
private String convertMultiLines(String content, XWikiContext context) {
StringBuffer result = new StringBuffer();
- String regexp = "\\{(\\w+)(:(.+))?\\}(.+?)\\{\\1\\}";
- Pattern p = Pattern.compile(regexp);
- Matcher m = p.matcher(content);
+ final String pattern = "\\{(\\w+)(:(.+))?\\}(.+?)\\{\\1\\}";
+ RegexIterator re = new RegexIterator(pattern, content);
+
int current = 0;
- while (m.find()) {
- result.append(content.substring(current, m.start()));
- current = m.end();
- String macroname = m.group(1);
- String params = m.group(3);
- String data = m.group(4);
- String allcontent = m.group(0);
+ while (re.nextMatch()) {
+ result.append(content.substring(current, re.begin(0)));
+ current = re.end(0);
+ String macroname = re.group(1);
+ String params = re.group(3);
+ String data = re.group(4);
+ String allcontent = re.group(0);
XWikiVirtualMacro macro = (XWikiVirtualMacro) macros_mappings.get(macroname);
if ((macro!=null)&&(macro.isMultiLine()))
Index: core/src/main/java/com/xpn/xwiki/atom/WSSEHttpHeader.java
===================================================================
--- core/src/main/java/com/xpn/xwiki/atom/WSSEHttpHeader.java (revision 2081)
+++ core/src/main/java/com/xpn/xwiki/atom/WSSEHttpHeader.java (working copy)
@@ -24,6 +24,7 @@
*/
package com.xpn.xwiki.atom;
+import com.xpn.xwiki.util.RegexIterator;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.codec.digest.DigestUtils;
@@ -32,8 +33,6 @@
import java.util.Calendar;
import java.util.GregorianCalendar;
import java.util.TimeZone;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
/**
* @author Luis
@@ -41,20 +40,23 @@
*/
public class WSSEHttpHeader {
- private static String userNameExpression = "Username=\"([a-zA-Z]+)\"";
+ private static final String userNameExpression = "Username=\"([a-zA-Z]+)\"";
- private static String nonceExpression = "Nonce=\"([A-Za-z0-9+/=]+)\"";
+ private static final String nonceExpression = "Nonce=\"([A-Za-z0-9+/=]+)\"";
- private static String passwordDigestExpression = "PasswordDigest=\"([A-Za-z0-9+/=]+)\"";
+ private static final String passwordDigestExpression = "PasswordDigest=\"([A-Za-z0-9+/=]+)\"";
- private static String createdExpression = "Created=\"([0-9:\\-+TZ]+)\"";
+ private static final String createdExpression = "Created=\"([0-9:\\-+TZ]+)\"";
- private static String parseCreatedExpression = "(\\d{4})(?:-?(\\d{2})(?:-?(\\d\\d?)(?:T(\\d{2}):(\\d{2}):(\\d{2})(?:\\.\\d+)?(?:(Z)|([+-]\\d{2}:\\d{2}))?)?)?)?";
+ private static final String parseCreatedExpression =
+ "(\\d{4})(?:-?(\\d{2})(?:-?(\\d\\d?)(?:T(\\d{2}):(\\d{2}):(\\d{2})(?:\\.\\d+)?(?:(Z)|([+-]\\d{2}:\\d{2}))?)?)?)?";
- private static Pattern headerPattern;
+ private static final String headerExpression = "UsernameToken "
+ + userNameExpression + ", "
+ + passwordDigestExpression + ", "
+ + nonceExpression + ", "
+ + createdExpression ;
- private static Pattern createdPattern = Pattern.compile(parseCreatedExpression);
-
private static SimpleDateFormat df = new SimpleDateFormat( "yyyy-MM-dd'T'HH:mm:ssZ" );
private String userName;
@@ -123,13 +125,13 @@
public static WSSEHttpHeader parseHttpHeader(String httpHeader) throws IOException {
WSSEHttpHeader wsseHeader = null;
- Matcher matcher = getHeaderPattern().matcher(httpHeader);
- if (matcher.matches()) {
+ RegexIterator re = new RegexIterator(headerExpression, httpHeader);
+ if (re.nextMatch()) {
wsseHeader = new WSSEHttpHeader();
- wsseHeader.setUserName(matcher.group(1));
- wsseHeader.setPasswordDigest(matcher.group(2));
- wsseHeader.setNonce(matcher.group(3));
- wsseHeader.setCreated(matcher.group(4));
+ wsseHeader.setUserName(re.group(1));
+ wsseHeader.setPasswordDigest(re.group(2));
+ wsseHeader.setNonce(re.group(3));
+ wsseHeader.setCreated(re.group(4));
if (!Base64.isArrayByteBase64(wsseHeader.getPasswordDigest().getBytes())) {
throw new IOException("Invalid Password Digest : " + wsseHeader.getPasswordDigest());
@@ -187,24 +189,6 @@
public static String getParseCreatedExpression() {
return parseCreatedExpression;
}
-
- /**
- * @return Returns the headerPattern.
- */
- public static Pattern getHeaderPattern() {
- if (headerPattern == null) {
- StringBuffer sb = new StringBuffer("UsernameToken ");
- sb.append(userNameExpression);
- sb.append(", ");
- sb.append(passwordDigestExpression);
- sb.append(", ");
- sb.append(nonceExpression);
- sb.append(", ");
- sb.append(createdExpression);
- headerPattern = Pattern.compile(sb.toString());
- }
- return headerPattern;
- }
public Calendar parseCreated() {
return parseCreated(created);
@@ -212,35 +196,35 @@
public static Calendar parseCreated(String w3CDTFValue) {
Calendar cal = null;
- Matcher matcher = createdPattern.matcher(w3CDTFValue);
- if (matcher.matches()) {
+ RegexIterator re = new RegexIterator(parseCreatedExpression, w3CDTFValue);
+ if (re.nextMatch()) {
String value;
int year;
int day;
int month;
- year = Integer.parseInt(matcher.group(1));
- if ((value = matcher.group(2)) == null) {
+ year = Integer.parseInt(re.group(1));
+ if ((value = re.group(2)) == null) {
month = Calendar.JANUARY;
} else {
month = Integer.parseInt(value) - (1 - Calendar.JANUARY);
}
- if ((value = matcher.group(3)) == null) {
+ if ((value = re.group(3)) == null) {
day = 1;
} else {
day = Integer.parseInt(value);
}
cal = new GregorianCalendar(year, month, day);
- String timeZone = matcher.group(7);
+ String timeZone = re.group(7);
if (timeZone != null && !timeZone.equals("Z")) {
cal.setTimeZone(TimeZone.getTimeZone(timeZone));
}
- if ((value = matcher.group(4)) != null) {
+ if ((value = re.group(4)) != null) {
cal.set(Calendar.HOUR_OF_DAY, Integer.parseInt(value));
}
- if ((value = matcher.group(5)) != null) {
+ if ((value = re.group(5)) != null) {
cal.set(Calendar.MINUTE, Integer.parseInt(value));
}
- if ((value = matcher.group(6)) != null) {
+ if ((value = re.group(6)) != null) {
cal.set(Calendar.SECOND, Integer.parseInt(value));
}
}
Index: core/src/main/java/com/xpn/xwiki/doc/XWikiDocument.java
===================================================================
--- core/src/main/java/com/xpn/xwiki/doc/XWikiDocument.java (revision 2081)
+++ core/src/main/java/com/xpn/xwiki/doc/XWikiDocument.java (working copy)
@@ -39,6 +39,7 @@
import com.xpn.xwiki.store.XWikiStoreInterface;
import com.xpn.xwiki.store.XWikiVersioningStoreInterface;
import com.xpn.xwiki.util.Util;
+import com.xpn.xwiki.util.RegexIterator;
import com.xpn.xwiki.validation.XWikiValidationInterface;
import com.xpn.xwiki.validation.XWikiValidationStatus;
import com.xpn.xwiki.web.EditForm;
@@ -49,7 +50,6 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.ecs.filter.CharacterFilter;
-import org.apache.oro.text.regex.MalformedPatternException;
import org.apache.velocity.VelocityContext;
import org.apache.velocity.app.tools.VelocityFormatter;
import org.dom4j.Document;
@@ -85,8 +85,6 @@
import java.util.Map;
import java.util.Set;
import java.util.Vector;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
@@ -2405,7 +2403,7 @@
try {
String pattern =
"#include(Topic|InContext|Form|Macros|parseGroovyFromPage)\\([\"'](.*?)[\"']\\)";
- List list = context.getUtil().getMatches(getContent(), pattern, 2);
+ List list = Util.getMatches(getContent(), pattern, 2);
for (int i = 0; i < list.size(); i++) {
try {
String name = (String) list.get(i);
@@ -2437,7 +2435,7 @@
try {
String pattern = "\\[(.*?)\\]";
List newlist = new ArrayList();
- List list = context.getUtil().getMatches(getContent(), pattern, 1);
+ List list = Util.getMatches(getContent(), pattern, 1);
for (int i = 0; i < list.size(); i++) {
try {
String name = (String) list.get(i);
@@ -3483,15 +3481,8 @@
String htmlregexp =
"</?(html|body|img|a|i|b|embed|script|form|input|textarea|object|font|li|ul|ol|table|center|hr|br|p) ?([^>]*)>";
- try {
- Util util = new Util();
- List list = util.getMatches(content2, htmlregexp, 1);
- if (list.size() > 0) {
- return true;
- }
- } catch (MalformedPatternException e) {
- }
- return false;
+ RegexIterator re = new RegexIterator(htmlregexp, content2);
+ return re.nextMatch();
}
public boolean isProgrammaticContent()
@@ -3555,21 +3546,21 @@
public List getSplitSectionsAccordingToTitle() throws XWikiException
{
// pattern to match the title
- Pattern pattern =
- Pattern.compile("^[\\p{Space}]*(1(\\.1)*)[\\p{Space}]+(.*?)$", Pattern.MULTILINE);
- Matcher matcher = pattern.matcher(getContent());
+ final String pattern = "(?m)^[\\s]*(1(\\.1)*)[\\s]+(.*?)$";
+ String contentTemp = getContent();
+ RegexIterator re = new RegexIterator(pattern, contentTemp);
+
List splitSections = new ArrayList();
int sectionNumber = 0;
- String contentTemp = getContent();
int beforeIndex = 0;
- while (matcher.find()) { // find title to split
- String sectionLevel = matcher.group(1);
+ while (re.nextMatch()) { // find title to split
+ String sectionLevel = re.group(1);
if (sectionLevel.equals("1") || sectionLevel.equals("1.1")) {
// only set editting for the title that is 1 or 1.1
sectionNumber++;
- String sectionTitle = matcher.group(3);
- int sectionIndex = contentTemp.indexOf(matcher.group(0), beforeIndex);
- beforeIndex = sectionIndex + matcher.group(0).length();
+ String sectionTitle = re.group(3);
+ int sectionIndex = contentTemp.indexOf(re.group(0), beforeIndex);
+ beforeIndex = sectionIndex + re.group(0).length();
// initialize a documentSection object
DocumentSection docSection =
new DocumentSection(sectionNumber, sectionIndex, sectionLevel, sectionTitle);
Index: core/src/main/java/com/xpn/xwiki/XWiki.java
===================================================================
--- core/src/main/java/com/xpn/xwiki/XWiki.java (revision 2081)
+++ core/src/main/java/com/xpn/xwiki/XWiki.java (working copy)
@@ -4398,7 +4398,7 @@
{
try {
String pattern = "#includeMacros\\(\"(.*?)\"\\)";
- List list = context.getUtil().getMatches(content, pattern, 1);
+ List list = Util.getMatches(content, pattern, 1);
for (int i = 0; i < list.size(); i++) {
try {
String name = (String) list.get(i);
Index: core/src/main/java/com/xpn/xwiki/util/RegexIterator.java
===================================================================
--- core/src/main/java/com/xpn/xwiki/util/RegexIterator.java (revision 0)
+++ core/src/main/java/com/xpn/xwiki/util/RegexIterator.java (revision 0)
@@ -0,0 +1,53 @@
+package com.xpn.xwiki.util;
+
+import org.apache.oro.text.regex.MatchResult;
+import org.apache.oro.text.regex.Pattern;
+import org.apache.oro.text.regex.PatternMatcher;
+import org.apache.oro.text.regex.PatternMatcherInput;
+
+/**
+ * This simple class iterates over the matches of a regular expression.
+ * The aim is to encapsulate the underlying regex engine, so that it can be
+ * changed more easily.
+ */
+
+public class RegexIterator
+{
+ private final Pattern pattern;
+ private final PatternMatcher matcher;
+ private final PatternMatcherInput input;
+ private MatchResult match;
+
+ public RegexIterator(String spattern, String sinput)
+ {
+ pattern = Util.getPatterns().getPattern(spattern);
+ matcher = new Util().getMatcher();
+ input = new PatternMatcherInput(sinput);
+ }
+
+ public boolean nextMatch()
+ {
+ if (matcher.contains(input, pattern))
+ {
+ match = matcher.getMatch();
+ return true;
+ }
+ else
+ return false;
+ }
+
+ public String group(int g)
+ {
+ return match.group(g);
+ }
+
+ public int begin(int g)
+ {
+ return match.beginOffset(g);
+ }
+
+ public int end (int g)
+ {
+ return match.endOffset(g);
+ }
+}
Index: core/src/main/java/com/xpn/xwiki/util/TOCGenerator.java
===================================================================
--- core/src/main/java/com/xpn/xwiki/util/TOCGenerator.java (revision 2090)
+++ core/src/main/java/com/xpn/xwiki/util/TOCGenerator.java (working copy)
@@ -28,10 +28,7 @@
import org.apache.commons.collections.map.ListOrderedMap;
import java.util.*;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
public class TOCGenerator {
public static final String TOC_DATA_NUMBERING = "numbering";
public static final String TOC_DATA_LEVEL = "level";
@@ -43,11 +40,12 @@
List processedHeadings = new ArrayList();
int previousNumbers[] = { 0, 0, 0, 0, 0, 0, 0 };
- Pattern pattern = Pattern.compile("^[\\p{Space}]*(1(\\.1)*)[\\p{Space}]+(.*?)$", Pattern.MULTILINE);
- Matcher matcher = pattern.matcher(content);
- while (matcher.find()) {
- int level = (matcher.group(1).lastIndexOf("1") + 2) / 2;
- String text = matcher.group(3);
+ final String pattern = "(?m)^[\\s]*(1(\\.1)*)[\\s]+(.*?)$";
+ RegexIterator re = new RegexIterator (pattern, content);
+
+ while (re.nextMatch()) {
+ int level = (re.group(1).lastIndexOf("1") + 2) / 2;
+ String text = re.group(3);
int occurence = 0;
for (Iterator iter = processedHeadings.iterator(); iter.hasNext();) if (iter.next().equals(text)) occurence++;
Index: core/src/main/java/com/xpn/xwiki/util/Util.java
===================================================================
--- core/src/main/java/com/xpn/xwiki/util/Util.java (revision 2081)
+++ core/src/main/java/com/xpn/xwiki/util/Util.java (working copy)
@@ -43,7 +43,6 @@
import java.util.*;
public class Util {
-
private static PatternCache patterns = new PatternCacheLRU(200);
private Perl5Matcher matcher = new Perl5Matcher();
private Perl5Util p5util = new Perl5Util(getPatterns());
@@ -73,21 +72,23 @@
public Perl5Util getP5util() {
return p5util;
}
-
- public List getMatches(String content, String spattern, int group) throws MalformedPatternException {
+
+ /**
+ * @return A List of strings corresponding to the matches of spattern over content. Only the given
+ * group is returned. Duplicate matches are discarded.
+ *
+ */
+ public static List getMatches(String content, String spattern, int group) throws MalformedPatternException {
List list = new ArrayList();
- PatternMatcherInput input = new PatternMatcherInput(content);
- Pattern pattern = patterns.addPattern(spattern);
- while (matcher.contains(input, pattern)) {
- MatchResult result = matcher.getMatch();
- String smatch = result.group(group);
+ RegexIterator re = new RegexIterator (spattern, content);
+ while (re.nextMatch()) {
+ String smatch = re.group(group);
if (!list.contains(smatch))
list.add(smatch);
}
return list;
}
-
public static String cleanValue(String value) {
value = StringUtils.replace(value,"\r\r\n", "%_N_%");
value = StringUtils.replace(value,"\r\n", "%_N_%");
More information about the devs
mailing list