[patch proposal] regex handling in XWiki core

Pablo Oliveira pablo.oliveira at enst.fr
Tue Feb 13 09:54:05 CET 2007


Hi,

As some of you may know, I am currently working on a version of XWiki for 
mobile devices.
I have been investigating the possibility of running some parts of XWiki
on a J2ME - CDC PP configuration. 

During this process I have noticed that XWiki uses two different APIs for
matching regular expressions:
   * Jakarta ORO
   * java.util.regex ( JDK >= 1.4 )

Because J2ME does not have the java.util.regex classes, I have made 
some small changes so that the core of XWiki only uses Jakarta ORO, 
so I can continue my tests.

Yet, I think these changes (see attached patch) may be of more general 
interest because:
   * it might be cleaner to stick to a single regex lib
   * this patch factors some of the regex handling into an
     encapsulating class that would allow us to change the
     underlying regex implementation more easily.

What do you think?

Regards,
Pablo
-------------- next part --------------
Index: core/src/main/java/com/xpn/xwiki/render/XWikiMacrosMappingRenderer.java
===================================================================
--- core/src/main/java/com/xpn/xwiki/render/XWikiMacrosMappingRenderer.java	(revision 2081)
+++ core/src/main/java/com/xpn/xwiki/render/XWikiMacrosMappingRenderer.java	(working copy)
@@ -29,13 +29,13 @@
 import com.xpn.xwiki.notify.DocChangeRule;
 import com.xpn.xwiki.notify.XWikiDocChangeNotificationInterface;
 import com.xpn.xwiki.notify.XWikiNotificationRule;
+import com.xpn.xwiki.util.RegexIterator;
+
 import org.apache.commons.lang.StringUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
 import java.util.HashMap;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
 public class XWikiMacrosMappingRenderer implements XWikiRenderer, XWikiDocChangeNotificationInterface {
     private static final Log log = LogFactory.getLog(XWikiMacrosMappingRenderer.class);
@@ -91,16 +91,16 @@
 
     private String convertSingleLines(String content, XWikiContext context) {
         StringBuffer result = new StringBuffer();
-        String regexp = "\\{(\\w+)(:(.+))?\\}";
-        Pattern p = Pattern.compile(regexp);
-        Matcher m = p.matcher(content);
+        final String pattern = "\\{(\\w+)(:(.+))?\\}";
+        RegexIterator re = new RegexIterator(pattern, content);
+        
         int current = 0;
-        while (m.find()) {
-            result.append(content.substring(current, m.start()));
-            current = m.end();
-            String macroname = m.group(1);
-            String params = m.group(3);
-            String allcontent = m.group(0);
+        while (re.nextMatch()) {
+            result.append(content.substring(current, re.begin(0)));
+            current = re.end(0);
+            String macroname = re.group(1);
+            String params = re.group(3);
+            String allcontent = re.group(0);
 
             XWikiVirtualMacro macro = (XWikiVirtualMacro) macros_mappings.get(macroname);
             if ((macro!=null)&&(macro.isSingleLine()))
@@ -117,17 +117,17 @@
 
     private String convertMultiLines(String content, XWikiContext context) {
         StringBuffer result = new StringBuffer();
-        String regexp = "\\{(\\w+)(:(.+))?\\}(.+?)\\{\\1\\}";
-        Pattern p = Pattern.compile(regexp);
-        Matcher m = p.matcher(content);
+        final String pattern = "\\{(\\w+)(:(.+))?\\}(.+?)\\{\\1\\}";
+        RegexIterator re = new RegexIterator(pattern, content);
+        
         int current = 0;
-        while (m.find()) {
-            result.append(content.substring(current, m.start()));
-            current = m.end();
-            String macroname = m.group(1);
-            String params = m.group(3);
-            String data = m.group(4);
-            String allcontent = m.group(0);
+        while (re.nextMatch()) {
+            result.append(content.substring(current, re.begin(0)));
+            current = re.end(0);
+            String macroname = re.group(1);
+            String params = re.group(3);
+            String data = re.group(4);
+            String allcontent = re.group(0);
 
             XWikiVirtualMacro macro = (XWikiVirtualMacro) macros_mappings.get(macroname);
             if ((macro!=null)&&(macro.isMultiLine()))
Index: core/src/main/java/com/xpn/xwiki/atom/WSSEHttpHeader.java
===================================================================
--- core/src/main/java/com/xpn/xwiki/atom/WSSEHttpHeader.java	(revision 2081)
+++ core/src/main/java/com/xpn/xwiki/atom/WSSEHttpHeader.java	(working copy)
@@ -24,6 +24,7 @@
  */
 package com.xpn.xwiki.atom;
 
+import com.xpn.xwiki.util.RegexIterator;
 import org.apache.commons.codec.binary.Base64;
 import org.apache.commons.codec.digest.DigestUtils;
 
@@ -32,8 +33,6 @@
 import java.util.Calendar;
 import java.util.GregorianCalendar;
 import java.util.TimeZone;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
 /**
  * @author Luis
@@ -41,20 +40,23 @@
  */
 public class WSSEHttpHeader {
   
-  private static String userNameExpression = "Username=\"([a-zA-Z]+)\"";
+  private static final String userNameExpression = "Username=\"([a-zA-Z]+)\"";
   
-  private static String nonceExpression = "Nonce=\"([A-Za-z0-9+/=]+)\"";
+  private static final String nonceExpression = "Nonce=\"([A-Za-z0-9+/=]+)\"";
   
-  private static String passwordDigestExpression = "PasswordDigest=\"([A-Za-z0-9+/=]+)\"";
+  private static final String passwordDigestExpression = "PasswordDigest=\"([A-Za-z0-9+/=]+)\"";
   
-  private static String createdExpression = "Created=\"([0-9:\\-+TZ]+)\"";
+  private static final String createdExpression = "Created=\"([0-9:\\-+TZ]+)\"";
   
-  private static String parseCreatedExpression = "(\\d{4})(?:-?(\\d{2})(?:-?(\\d\\d?)(?:T(\\d{2}):(\\d{2}):(\\d{2})(?:\\.\\d+)?(?:(Z)|([+-]\\d{2}:\\d{2}))?)?)?)?";
+  private static final String parseCreatedExpression = 
+      "(\\d{4})(?:-?(\\d{2})(?:-?(\\d\\d?)(?:T(\\d{2}):(\\d{2}):(\\d{2})(?:\\.\\d+)?(?:(Z)|([+-]\\d{2}:\\d{2}))?)?)?)?";
   
-  private static Pattern headerPattern;
+  private static final String headerExpression = "UsernameToken " 
+                                               + userNameExpression + ", " 
+                                               + passwordDigestExpression + ", " 
+                                               + nonceExpression + ", " 
+                                               + createdExpression ;
   
-  private static Pattern createdPattern = Pattern.compile(parseCreatedExpression);
-  
   private static SimpleDateFormat df = new SimpleDateFormat( "yyyy-MM-dd'T'HH:mm:ssZ" );
 
   private String userName;
@@ -123,13 +125,13 @@
 
   public static WSSEHttpHeader parseHttpHeader(String httpHeader) throws IOException {
     WSSEHttpHeader wsseHeader = null;
-    Matcher matcher = getHeaderPattern().matcher(httpHeader);
-    if (matcher.matches()) {
+    RegexIterator re = new RegexIterator(headerExpression, httpHeader);
+    if (re.nextMatch()) {
         wsseHeader = new WSSEHttpHeader();
-        wsseHeader.setUserName(matcher.group(1));
-        wsseHeader.setPasswordDigest(matcher.group(2));
-        wsseHeader.setNonce(matcher.group(3));
-        wsseHeader.setCreated(matcher.group(4));
+        wsseHeader.setUserName(re.group(1));
+        wsseHeader.setPasswordDigest(re.group(2));
+        wsseHeader.setNonce(re.group(3));
+        wsseHeader.setCreated(re.group(4));
         
         if (!Base64.isArrayByteBase64(wsseHeader.getPasswordDigest().getBytes())) {
           throw new IOException("Invalid Password Digest : " + wsseHeader.getPasswordDigest());
@@ -187,24 +189,6 @@
   public static String getParseCreatedExpression() {
     return parseCreatedExpression;
   }
-
-  /**
-   * @return Returns the headerPattern.
-   */
-  public static Pattern getHeaderPattern() {
-    if (headerPattern == null) {
-      StringBuffer sb = new StringBuffer("UsernameToken ");
-      sb.append(userNameExpression);
-      sb.append(", ");
-      sb.append(passwordDigestExpression);
-      sb.append(", ");
-      sb.append(nonceExpression);
-      sb.append(", ");
-      sb.append(createdExpression);
-      headerPattern = Pattern.compile(sb.toString());      
-    }
-    return headerPattern;
-  }
   
   public Calendar parseCreated() {
     return parseCreated(created);
@@ -212,35 +196,35 @@
 
   public static Calendar parseCreated(String w3CDTFValue) {
     Calendar cal = null;
-    Matcher matcher = createdPattern.matcher(w3CDTFValue);
-    if (matcher.matches()) {
+    RegexIterator re = new RegexIterator(parseCreatedExpression, w3CDTFValue);
+    if (re.nextMatch()) {
       String value;
       int year;
       int day;
       int month;
-      year = Integer.parseInt(matcher.group(1));
-      if ((value = matcher.group(2)) == null) {
+      year = Integer.parseInt(re.group(1));
+      if ((value = re.group(2)) == null) {
         month = Calendar.JANUARY;
       } else {
         month = Integer.parseInt(value) - (1 - Calendar.JANUARY);
       }
-      if ((value = matcher.group(3)) == null) {
+      if ((value = re.group(3)) == null) {
         day = 1;
       } else {
         day = Integer.parseInt(value);
       }
       cal = new GregorianCalendar(year, month, day);
-      String timeZone = matcher.group(7);
+      String timeZone = re.group(7);
       if (timeZone != null && !timeZone.equals("Z")) {
         cal.setTimeZone(TimeZone.getTimeZone(timeZone));
       }      
-      if ((value = matcher.group(4)) != null) {
+      if ((value = re.group(4)) != null) {
         cal.set(Calendar.HOUR_OF_DAY, Integer.parseInt(value));
       }
-      if ((value = matcher.group(5)) != null) {
+      if ((value = re.group(5)) != null) {
         cal.set(Calendar.MINUTE, Integer.parseInt(value));
       }
-      if ((value = matcher.group(6)) != null) {
+      if ((value = re.group(6)) != null) {
         cal.set(Calendar.SECOND, Integer.parseInt(value));
       }
     }
Index: core/src/main/java/com/xpn/xwiki/doc/XWikiDocument.java
===================================================================
--- core/src/main/java/com/xpn/xwiki/doc/XWikiDocument.java	(revision 2081)
+++ core/src/main/java/com/xpn/xwiki/doc/XWikiDocument.java	(working copy)
@@ -39,6 +39,7 @@
 import com.xpn.xwiki.store.XWikiStoreInterface;
 import com.xpn.xwiki.store.XWikiVersioningStoreInterface;
 import com.xpn.xwiki.util.Util;
+import com.xpn.xwiki.util.RegexIterator;
 import com.xpn.xwiki.validation.XWikiValidationInterface;
 import com.xpn.xwiki.validation.XWikiValidationStatus;
 import com.xpn.xwiki.web.EditForm;
@@ -49,7 +50,6 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.ecs.filter.CharacterFilter;
-import org.apache.oro.text.regex.MalformedPatternException;
 import org.apache.velocity.VelocityContext;
 import org.apache.velocity.app.tools.VelocityFormatter;
 import org.dom4j.Document;
@@ -85,8 +85,6 @@
 import java.util.Map;
 import java.util.Set;
 import java.util.Vector;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipOutputStream;
 
@@ -2405,7 +2403,7 @@
         try {
             String pattern =
                 "#include(Topic|InContext|Form|Macros|parseGroovyFromPage)\\([\"'](.*?)[\"']\\)";
-            List list = context.getUtil().getMatches(getContent(), pattern, 2);
+            List list = Util.getMatches(getContent(), pattern, 2);
             for (int i = 0; i < list.size(); i++) {
                 try {
                     String name = (String) list.get(i);
@@ -2437,7 +2435,7 @@
         try {
             String pattern = "\\[(.*?)\\]";
             List newlist = new ArrayList();
-            List list = context.getUtil().getMatches(getContent(), pattern, 1);
+            List list = Util.getMatches(getContent(), pattern, 1);
             for (int i = 0; i < list.size(); i++) {
                 try {
                     String name = (String) list.get(i);
@@ -3483,15 +3481,8 @@
 
         String htmlregexp =
             "</?(html|body|img|a|i|b|embed|script|form|input|textarea|object|font|li|ul|ol|table|center|hr|br|p) ?([^>]*)>";
-        try {
-            Util util = new Util();
-            List list = util.getMatches(content2, htmlregexp, 1);
-            if (list.size() > 0) {
-                return true;
-            }
-        } catch (MalformedPatternException e) {
-        }
-        return false;
+        RegexIterator re = new RegexIterator(htmlregexp, content2);
+        return re.nextMatch();        
     }
 
     public boolean isProgrammaticContent()
@@ -3555,21 +3546,21 @@
     public List getSplitSectionsAccordingToTitle() throws XWikiException
     {
         // pattern to match the title
-        Pattern pattern =
-            Pattern.compile("^[\\p{Space}]*(1(\\.1)*)[\\p{Space}]+(.*?)$", Pattern.MULTILINE);
-        Matcher matcher = pattern.matcher(getContent());
+        final String pattern = "(?m)^[\\s]*(1(\\.1)*)[\\s]+(.*?)$";
+        String contentTemp = getContent();
+        RegexIterator re = new RegexIterator(pattern, contentTemp);
+        
         List splitSections = new ArrayList();
         int sectionNumber = 0;
-        String contentTemp = getContent();
         int beforeIndex = 0;
-        while (matcher.find()) {  // find title to split
-            String sectionLevel = matcher.group(1);
+        while (re.nextMatch()) {  // find title to split
+            String sectionLevel = re.group(1);
             if (sectionLevel.equals("1") || sectionLevel.equals("1.1")) {
                 // only set editting for the title that is 1 or 1.1
                 sectionNumber++;
-                String sectionTitle = matcher.group(3);
-                int sectionIndex = contentTemp.indexOf(matcher.group(0), beforeIndex);
-                beforeIndex = sectionIndex + matcher.group(0).length();
+                String sectionTitle = re.group(3);
+                int sectionIndex = contentTemp.indexOf(re.group(0), beforeIndex);
+                beforeIndex = sectionIndex + re.group(0).length();
                 // initialize a documentSection object
                 DocumentSection docSection =
                     new DocumentSection(sectionNumber, sectionIndex, sectionLevel, sectionTitle);
Index: core/src/main/java/com/xpn/xwiki/XWiki.java
===================================================================
--- core/src/main/java/com/xpn/xwiki/XWiki.java	(revision 2081)
+++ core/src/main/java/com/xpn/xwiki/XWiki.java	(working copy)
@@ -4398,7 +4398,7 @@
     {
         try {
             String pattern = "#includeMacros\\(\"(.*?)\"\\)";
-            List list = context.getUtil().getMatches(content, pattern, 1);
+            List list = Util.getMatches(content, pattern, 1);
             for (int i = 0; i < list.size(); i++) {
                 try {
                     String name = (String) list.get(i);
Index: core/src/main/java/com/xpn/xwiki/util/RegexIterator.java
===================================================================
--- core/src/main/java/com/xpn/xwiki/util/RegexIterator.java	(revision 0)
+++ core/src/main/java/com/xpn/xwiki/util/RegexIterator.java	(revision 0)
@@ -0,0 +1,53 @@
+package com.xpn.xwiki.util;
+
+import org.apache.oro.text.regex.MatchResult;
+import org.apache.oro.text.regex.Pattern;
+import org.apache.oro.text.regex.PatternMatcher;
+import org.apache.oro.text.regex.PatternMatcherInput;
+
+/**
+ * This simple class iterates over the matches of a regular expression. 
+ * The aim is to encapsulate the underlying regex engine, so it can be changed
+ * more easily.
+ */
+
+public class RegexIterator
+{
+    private final Pattern pattern;
+    private final PatternMatcher matcher;
+    private final PatternMatcherInput input;
+    private MatchResult match;
+    
+    public RegexIterator(String spattern, String sinput)
+    {
+        pattern = Util.getPatterns().getPattern(spattern);
+        matcher = new Util().getMatcher();
+        input = new PatternMatcherInput(sinput);
+    }
+
+    public boolean nextMatch()
+    {
+        if (matcher.contains(input, pattern))
+        {
+            match = matcher.getMatch();
+            return true;
+        }
+        else 
+            return false;
+    }
+
+    public String group(int g)
+    {
+        return match.group(g);
+    }
+    
+    public int begin(int g)
+    {
+        return match.beginOffset(g);
+    }
+    
+    public int end (int g)
+    {
+        return match.endOffset(g);
+    }
+}
Index: core/src/main/java/com/xpn/xwiki/util/TOCGenerator.java
===================================================================
--- core/src/main/java/com/xpn/xwiki/util/TOCGenerator.java	(revision 2090)
+++ core/src/main/java/com/xpn/xwiki/util/TOCGenerator.java	(working copy)
@@ -28,10 +28,7 @@
 import org.apache.commons.collections.map.ListOrderedMap;
 
 import java.util.*;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
-
 public class TOCGenerator {
   public static final String TOC_DATA_NUMBERING = "numbering";
   public static final String TOC_DATA_LEVEL = "level";
@@ -43,11 +40,12 @@
     List processedHeadings = new ArrayList();
     int previousNumbers[] = { 0, 0, 0, 0, 0, 0, 0 };
 
-    Pattern pattern = Pattern.compile("^[\\p{Space}]*(1(\\.1)*)[\\p{Space}]+(.*?)$", Pattern.MULTILINE);
-    Matcher matcher = pattern.matcher(content);
-    while (matcher.find()) {
-      int level = (matcher.group(1).lastIndexOf("1") + 2) / 2;
-      String text = matcher.group(3);
+    final String pattern = "(?m)^[\\s]*(1(\\.1)*)[\\s]+(.*?)$";
+    RegexIterator re = new RegexIterator (pattern, content);
+
+    while (re.nextMatch()) {
+      int level = (re.group(1).lastIndexOf("1") + 2) / 2;
+      String text = re.group(3);
       
       int occurence = 0;
       for (Iterator iter = processedHeadings.iterator(); iter.hasNext();) if (iter.next().equals(text)) occurence++;
Index: core/src/main/java/com/xpn/xwiki/util/Util.java
===================================================================
--- core/src/main/java/com/xpn/xwiki/util/Util.java	(revision 2081)
+++ core/src/main/java/com/xpn/xwiki/util/Util.java	(working copy)
@@ -43,7 +43,6 @@
 import java.util.*;
 
 public class Util {
-
     private static PatternCache patterns = new PatternCacheLRU(200);
     private Perl5Matcher matcher = new Perl5Matcher();
     private Perl5Util p5util = new Perl5Util(getPatterns());
@@ -73,21 +72,23 @@
     public Perl5Util getP5util() {
         return p5util;
     }
-
-    public List getMatches(String content, String spattern, int group) throws MalformedPatternException {
+    
+    /**
+     * @return A List of strings corresponding to the matches of spattern over content. Only the given
+     *         group is returned. Duplicate matches are discarded.
+     * 
+     */
+    public static List getMatches(String content, String spattern, int group) throws MalformedPatternException {
         List list = new ArrayList();
-        PatternMatcherInput input = new PatternMatcherInput(content);
-        Pattern pattern = patterns.addPattern(spattern);
-        while (matcher.contains(input, pattern)) {
-            MatchResult result = matcher.getMatch();
-            String smatch = result.group(group);
+        RegexIterator re = new RegexIterator (spattern, content);
+        while (re.nextMatch()) {
+            String smatch = re.group(group);
             if (!list.contains(smatch))
                 list.add(smatch);
         }
         return list;
     }
 
-
     public static String cleanValue(String value) {
         value = StringUtils.replace(value,"\r\r\n", "%_N_%");
         value = StringUtils.replace(value,"\r\n", "%_N_%");


More information about the devs mailing list