Hi Asiri,
I think I'd really prefer one filter per class, the same as what is done
in the HTML cleaner. Also, please don't use any *Utils class, and no
statics please (these are both anti-patterns).
Thanks
-Vincent
On Oct 28, 2008, at 2:54 PM, asiri (SVN) wrote:
Author: asiri
Date: 2008-10-28 14:54:04 +0100 (Tue, 28 Oct 2008)
New Revision: 13868
Removed:
sandbox/xwiki-plugin-officeimporter/src/main/java/com/xpn/xwiki/
plugin/officeimporter/filter/
Modified:
sandbox/xwiki-plugin-officeimporter/src/main/java/com/xpn/xwiki/
plugin/officeimporter/OfficeImporterPlugin.java
sandbox/xwiki-plugin-officeimporter/src/main/java/com/xpn/xwiki/
plugin/officeimporter/utils/HtmlFilterUtils.java
sandbox/xwiki-plugin-officeimporter/src/test/java/com/xpn/xwiki/
plugin/officeconverter/CleanHTMLTest.java
Log:
Moved all HTML filter code into a single utility class called
HtmlFilterUtils. I thought of introducing some sort of filter
chain (maybe a chain-of-responsibility pattern), but it seemed like
overkill for this scenario.
Modified: sandbox/xwiki-plugin-officeimporter/src/main/java/com/xpn/
xwiki/plugin/officeimporter/OfficeImporterPlugin.java
===================================================================
--- sandbox/xwiki-plugin-officeimporter/src/main/java/com/xpn/xwiki/
plugin/officeimporter/OfficeImporterPlugin.java 2008-10-28 11:33:41
UTC (rev 13867)
+++ sandbox/xwiki-plugin-officeimporter/src/main/java/com/xpn/xwiki/
plugin/officeimporter/OfficeImporterPlugin.java 2008-10-28 13:54:04
UTC (rev 13868)
@@ -57,14 +57,9 @@
import com.xpn.xwiki.doc.XWikiDocument;
import com.xpn.xwiki.plugin.XWikiDefaultPlugin;
import com.xpn.xwiki.plugin.XWikiPluginInterface;
-import com.xpn.xwiki.plugin.officeimporter.filter.EmptyLinkFilter;
-import com.xpn.xwiki.plugin.officeimporter.filter.ImageTagFilter;
-import com.xpn.xwiki.plugin.officeimporter.filter.PinLiFilter;
-import com.xpn.xwiki.plugin.officeimporter.filter.TagRemoveFilter;
-import
com.xpn.xwiki.plugin.officeimporter.filter.UnderlineLinkFilter;
-import
com.xpn.xwiki.plugin.officeimporter.filter.XWikiSyntaxEscapeFilter;
-import com.xpn.xwiki.plugin.officeimporter.utils.ImporterException;
import com.xpn.xwiki.plugin.officeimporter.utils.DocumentType;
+import com.xpn.xwiki.plugin.officeimporter.utils.HtmlFilterUtils;
+import com.xpn.xwiki.plugin.officeimporter.utils.ImporterException;
import com.xpn.xwiki.web.Utils;
/**
@@ -471,9 +466,7 @@
HTMLCleaner.ROLE), e);
}
Document document = htmlCleaner.clean(new
StringReader(inputHTML));
-
- new UnderlineLinkFilter().filter(document);
-
+ HtmlFilterUtils.filterUnderlinedLinks(document);
XMLUtils.stripHTMLEnvelope(document);
String cleanedHTML = XMLUtils.toString(document);
return cleanedHTML;
@@ -499,14 +492,12 @@
HTMLCleaner.ROLE), e);
}
Document document = htmlCleaner.clean(new
StringReader(inputHTML));
-
- new TagRemoveFilter().filter(document);
- new UnderlineLinkFilter().filter(document);
- new XWikiSyntaxEscapeFilter().filter(document);
- new ImageTagFilter().filter(document);
- new PinLiFilter().filter(document);
- new EmptyLinkFilter().filter(document);
-
+ HtmlFilterUtils.filterTags(document, new String[]{"style",
"script"});
+ HtmlFilterUtils.filterUnderlinedLinks(document);
+ HtmlFilterUtils.filterSytaxChars(document);
+ HtmlFilterUtils.filterImageLinks(document);
+ HtmlFilterUtils.filterParagraphTagsInLineItemTags(document);
+ HtmlFilterUtils.filterEmptyLinks(document);
XMLUtils.stripHTMLEnvelope(document);
String cleanedHTML = XMLUtils.toString(document);
return cleanedHTML;
Modified: sandbox/xwiki-plugin-officeimporter/src/main/java/com/xpn/
xwiki/plugin/officeimporter/utils/HtmlFilterUtils.java
===================================================================
--- sandbox/xwiki-plugin-officeimporter/src/main/java/com/xpn/xwiki/
plugin/officeimporter/utils/HtmlFilterUtils.java 2008-10-28 11:33:41
UTC (rev 13867)
+++ sandbox/xwiki-plugin-officeimporter/src/main/java/com/xpn/xwiki/
plugin/officeimporter/utils/HtmlFilterUtils.java 2008-10-28 13:54:04
UTC (rev 13868)
@@ -1,12 +1,247 @@
package com.xpn.xwiki.plugin.officeimporter.utils;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.w3c.dom.Text;
+
/**
* A utility class containing a suite of filter methods used to
manipulate Html documents.
*
* @version $Id$
* @since 1.7M1
*/
-public class HtmlFilterUtils
+public abstract class HtmlFilterUtils
{
+ /**
+ * Characters that need to be escaped when jumping from html to
xwiki syntax.
+ */
+ private static final List<String> escapeChars = new
ArrayList<String>();
+ /**
+ * Static initializer for escape chars.
+ */
+ static {
+ escapeChars.add("[");
+ escapeChars.add("]");
+ escapeChars.add("{");
+ escapeChars.add("}");
+ escapeChars.add("*");
+ escapeChars.add("~");
+ escapeChars.add("_");
+ escapeChars.add("-");
+ escapeChars.add("1");
+ escapeChars.add("#");
+ escapeChars.add("$");
+ }
+
+ /**
+ * Removes empty links from html documents. If the label of the
link is empty, simply remove the
+ * tag as in {@code <a/>} or {@code <a href=""/>}. If the
label
is not null but the href
+ * attribute is missing, replace the tag with it's label. Like
changing {@code <a>something</a>}
+ * to {@code something}.
+ *
+ * @param document The html document.
+ */
+ public static void filterEmptyLinks(Document document)
+ {
+ Element root = document.getDocumentElement();
+ NodeList links = root.getElementsByTagName("a");
+ for (int i = 0; i < links.getLength(); i++) {
+ Node link = links.item(i);
+ if (link.getTextContent() == null ||
link.getTextContent().trim().equals("")) {
+ link.getParentNode().removeChild(link);
+ i--;
+ continue;
+ }
+
+ Node hrefAttr =
link.getAttributes().getNamedItem("href");
+ if (hrefAttr == null ||
hrefAttr.getTextContent().trim().equals("")) {
+ NodeList children = link.getChildNodes();
+ while (children.getLength() > 0) {
+
link.getParentNode().insertBefore(children.item(0), link);
+ }
+ link.getParentNode().removeChild(link);
+ i--;
+ }
+ }
+ }
+
+ /**
+ * Replaces the {@code <img>} tags with corresponding {image}
macro elements which are
+ * recognized by xwiki syntax 1.0. Handles image attributes
like src, width, height, alt, align.
+ *
+ * @param document The html document.
+ */
+ public static void filterImageLinks(Document document)
+ {
+ Element root = document.getDocumentElement();
+ NodeList imgs = root.getElementsByTagName("img");
+ while (imgs.getLength() > 0) {
+ Node image = imgs.item(0);
+ String imageCode = generateImageMacroString(image);
+ Node parent = image.getParentNode();
+ Text newImg = document.createTextNode(imageCode);
+ parent.replaceChild(newImg, image);
+ }
+ }
+
+ /**
+ * Converts a {@code <img>} element into a xwiki syntax 1.0
{image} macro element.
+ *
+ * @param imageLink Node representing the image link.
+ * @return Converted {image} macro string.
+ */
+ private static String generateImageMacroString(Node imageLink)
+ {
+ NamedNodeMap attrs = imageLink.getAttributes();
+ if (attrs == null) {
+ return null;
+ }
+ StringBuffer sb = new StringBuffer();
+ sb.append("{image:");
+ if (attrs.getNamedItem("src") != null) {
+ String src = attrs.getNamedItem("src").getTextContent();
+ sb.append(src);
+ }
+ if (attrs.getNamedItem("width") != null) {
+ String width =
attrs.getNamedItem("width").getTextContent();
+ sb.append("|width=" + width);
+ }
+ if (attrs.getNamedItem("height") != null) {
+ String height =
attrs.getNamedItem("height").getTextContent();
+ sb.append("|height=" + height);
+ }
+ if (attrs.getNamedItem("alt") != null) {
+ String alt = attrs.getNamedItem("alt").getTextContent();
+ sb.append("|alt=" + alt);
+ }
+ if (attrs.getNamedItem("align") != null) {
+ String align =
attrs.getNamedItem("align").getTextContent();
+ sb.append("|align=" + align);
+ }
+ sb.append("}");
+ return sb.toString();
+ }
+
+ /**
+ * Removes the starting {@code <p>} tags found within {@code
<li>} tags. This is useful since
+ * such formations are not properly handled in xwiki 1.0 syntax.
+ *
+ * @param document The html document.
+ */
+ public static void filterParagraphTagsInLineItemTags(Document
document)
+ {
+ Element root = document.getDocumentElement();
+ NodeList lists = root.getElementsByTagName("li");
+ for (int i = 0; i < lists.getLength(); i++) {
+ Node list = lists.item(i);
+ Node firstChild = list.getFirstChild();
+ if (firstChild.getNodeName() != null &&
firstChild.getNodeName().equals("p")) {
+ NodeList childchildren = firstChild.getChildNodes();
+ while (childchildren.getLength() > 0) {
+ list.insertBefore(childchildren.item(0),
firstChild);
+ }
+ list.removeChild(firstChild);
+ }
+ }
+ }
+
+ /**
+ * Removes all listed tags from the given html document.
+ *
+ * @param document The html document.
+ * @param tags Tags to be removed.
+ */
+ public static void filterTags(Document document, String[] tags)
+ {
+ Element root = document.getDocumentElement();
+ for (String tag : tags) {
+ NodeList toBeRemovedTags =
root.getElementsByTagName(tag);
+ while (toBeRemovedTags.getLength() > 0) {
+ Node t = toBeRemovedTags.item(0);
+ t.getParentNode().removeChild(t);
+ }
+ }
+ }
+
+ /**
+ * Strips off underline tags surrounding links like {@code
<u><a href="something">link</a></u>}.
+ *
+ * @param document The html document.
+ */
+ public static void filterUnderlinedLinks(Document document)
+ {
+ Element root = document.getDocumentElement();
+ NodeList links = root.getElementsByTagName("a");
+ for (int i = 0; i < links.getLength(); i++) {
+ Node link = links.item(i);
+ Node parent = link.getParentNode();
+ String parentName = parent.getNodeName();
+ if (parentName != null && (parentName.equals("u") ||
parentName.equals("del"))) {
+ parent.getParentNode().insertBefore(link, parent);
+ parent.getParentNode().removeChild(parent);
+ }
+ }
+ }
+
+ /**
+ * Escapes the xwiki sytax characters from the given html
document. Example : {@code [} will be
+ * replaced by {@code \]}.
+ *
+ * @param document The html document.
+ */
+ public static void filterSytaxChars(Document document)
+ {
+ Element root = document.getDocumentElement();
+ escapeNode(root);
+ }
+
+ /**
+ * Escapes xwiki syntax characters within the given node's
content.
+ *
+ * @param node The node which is to be examined.
+ */
+ private static void escapeNode(Node node)
+ {
+ NodeList nodes = node.getChildNodes();
+ for (int i = 0; i < nodes.getLength(); i++) {
+ Node next = nodes.item(i);
+ if (next instanceof Text) {
+ String text = next.getTextContent();
+ text = escapeText(text);
+ next.setTextContent(text);
+ } else {
+ if (next.hasChildNodes()) {
+ escapeNode(next);
+ }
+ }
+ }
+ }
+
+ /**
+ * Escapes xwiki syntax characters within the given string.
+ *
+ * @param text The string to be examined.
+ * @return The syntax escaped string.
+ */
+ private static String escapeText(String text)
+ {
+ StringBuffer sb = new StringBuffer();
+ for (int i = 0; i < text.length(); i++) {
+ char x = text.charAt(i);
+ if (escapeChars.contains(String.valueOf(x))) {
+ sb.append("\\");
+ sb.append(String.valueOf(x));
+ } else {
+ sb.append(x);
+ }
+ }
+ return sb.toString();
+ }
}
Modified: sandbox/xwiki-plugin-officeimporter/src/test/java/com/xpn/
xwiki/plugin/officeconverter/CleanHTMLTest.java
===================================================================
--- sandbox/xwiki-plugin-officeimporter/src/test/java/com/xpn/xwiki/
plugin/officeconverter/CleanHTMLTest.java 2008-10-28 11:33:41 UTC
(rev 13867)
+++ sandbox/xwiki-plugin-officeimporter/src/test/java/com/xpn/xwiki/
plugin/officeconverter/CleanHTMLTest.java 2008-10-28 13:54:04 UTC
(rev 13868)
@@ -27,13 +27,7 @@
import org.xwiki.xml.XMLUtils;
import org.xwiki.xml.html.HTMLCleaner;
-import com.xpn.xwiki.plugin.officeimporter.filter.EmptyLinkFilter;
-import com.xpn.xwiki.plugin.officeimporter.filter.HTMLFilter;
-import com.xpn.xwiki.plugin.officeimporter.filter.ImageTagFilter;
-import com.xpn.xwiki.plugin.officeimporter.filter.PinLiFilter;
-import com.xpn.xwiki.plugin.officeimporter.filter.TagRemoveFilter;
-import
com.xpn.xwiki.plugin.officeimporter.filter.UnderlineLinkFilter;
-import
com.xpn.xwiki.plugin.officeimporter.filter.XWikiSyntaxEscapeFilter;
+import com.xpn.xwiki.plugin.officeimporter.utils.HtmlFilterUtils;
import com.xpn.xwiki.plugin.officeimporter.utils.ImporterException;
import com.xpn.xwiki.test.AbstractXWikiComponentTestCase;
@@ -121,14 +115,12 @@
private void test(String input, String expected) throws
ImporterException
{
Document document = cleaner.clean(new StringReader(input));
-
- new TagRemoveFilter().filter(document);
- new UnderlineLinkFilter().filter(document);
- new XWikiSyntaxEscapeFilter().filter(document);
- new ImageTagFilter().filter(document);
- new PinLiFilter().filter(document);
- new EmptyLinkFilter().filter(document);
-
+ HtmlFilterUtils.filterTags(document, new String[]{"style",
"script"});
+ HtmlFilterUtils.filterUnderlinedLinks(document);
+ HtmlFilterUtils.filterSytaxChars(document);
+ HtmlFilterUtils.filterImageLinks(document);
+ HtmlFilterUtils.filterParagraphTagsInLineItemTags(document);
+ HtmlFilterUtils.filterEmptyLinks(document);
XMLUtils.stripHTMLEnvelope(document);
String actual = XMLUtils.toString(document);
assertEquals(HEAD + expected + FOOT, actual);
_______________________________________________
notifications mailing list
notifications(a)xwiki.org
http://lists.xwiki.org/mailman/listinfo/notifications