r1690 - in xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin: . lucene lucene/textextraction lucene/textextraction/xmlutil
Jeremi Joslin
jeremi at users.forge.objectweb.org
Wed Dec 6 23:29:03 CET 2006
Author: jeremi
Date: 2006-12-06 23:29:02 +0100 (Wed, 06 Dec 2006)
New Revision: 1690
Added:
xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/
xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/AttachmentData.java
xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/DocumentData.java
xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/IndexData.java
xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/IndexFields.java
xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/IndexRebuilder.java
xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/IndexUpdater.java
xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/LucenePlugin.java
xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/LucenePluginApi.java
xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/ObjectData.java
xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/SearchResult.java
xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/SearchResults.java
xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/TextExtractor.java
xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/XWikiDocumentQueue.java
xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/
xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/MSExcelTextExtractor.java
xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/MSPowerPointTextExtractor.java
xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/MSWordTextExtractor.java
xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/MimetypeTextExtractor.java
xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/OpenOfficeTextExtractor.java
xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/PDFTextExtractor.java
xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/PlainTextExtractor.java
xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/XmlTextExtractor.java
xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/xmlutil/
xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/xmlutil/XmlEncodingDetector.java
Log:
move the lucene plugin to the core of xwiki
Added: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/AttachmentData.java
===================================================================
--- xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/AttachmentData.java 2006-12-06 21:50:31 UTC (rev 1689)
+++ xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/AttachmentData.java 2006-12-06 22:29:02 UTC (rev 1690)
@@ -0,0 +1,179 @@
+/*
+ *
+ * ===================================================================
+ *
+ * Copyright (c) 2005 Jens Krämer, All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details, published at
+ * http://www.gnu.org/copyleft/gpl.html or in gpl.txt in the
+ * root folder of this distribution.
+ *
+ * Created on 25.01.2005
+ *
+ */
+package com.xpn.xwiki.plugin.lucene;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.log4j.Logger;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
+import com.xpn.xwiki.XWikiContext;
+import com.xpn.xwiki.doc.XWikiAttachment;
+import com.xpn.xwiki.doc.XWikiDocument;
+
+/**
+ * Holds all data but the content of an attachment to be indexed. The content is
+ * retrieved at indexing time, which should save us some memory especially when
+ * rebuilding an index for a big wiki.
+ * @author <a href="mailto:jk at jkraemer.net">Jens Krämer </a>
+ */
+public class AttachmentData extends IndexData
+{
+ /**
+ * Mapping from lower-case file name endings to mime types. getContentAsText
+ * looks up the ending of the attachment's file name here to pick the mime
+ * type that is handed to TextExtractor.
+ */
+ static final Map MIMETYPES = new HashMap ();
+ static
+ {
+ MIMETYPES.put ("pdf", "application/pdf");
+ MIMETYPES.put ("doc", "application/msword");
+ MIMETYPES.put ("sxw", "application/vnd.sun.xml.writer");
+ MIMETYPES.put ("xml", "text/xml");
+ MIMETYPES.put ("txt", "text/plain");
+ MIMETYPES.put ("ppt", "application/ms-powerpoint");
+ MIMETYPES.put ("xls", "application/ms-excel");
+ }
+
+ private static final Logger LOG = Logger.getLogger (AttachmentData.class);
+ private int size;
+ private String filename;
+
+ /**
+ * Copies date, author, size and filename from the attachment. The owning document is taken
+ * from attachment.getDoc (); the document argument itself is not read by this constructor.
+ */
+ public AttachmentData (final XWikiDocument document, final XWikiAttachment attachment,
+ final XWikiContext context)
+ {
+ super (attachment.getDoc (), context);
+ setModificationDate (attachment.getDate ());
+ setAuthor (attachment.getAuthor ());
+ setSize (attachment.getFilesize ());
+ setFilename (attachment.getFilename ());
+ }
+
+ /**
+ * Adds the fields of IndexData plus, when a filename is set, the stored and tokenized FILENAME field.
+ */
+ public void addDataToLuceneDocument (Document luceneDoc, XWikiDocument doc, XWikiContext context)
+ {
+ super.addDataToLuceneDocument (luceneDoc, doc, context);
+ if (filename != null) luceneDoc.add (new Field (IndexFields.FILENAME, filename, Field.Store.YES, Field.Index.TOKENIZED));
+ }
+
+ /**
+ * @param size
+ * The size to set.
+ */
+ public void setSize (int size)
+ {
+ this.size = size;
+ }
+
+ /**
+ * @see net.jkraemer.xwiki.plugins.lucene.IndexData#getType()
+ */
+ public String getType ()
+ {
+ return LucenePlugin.DOCTYPE_ATTACHMENT;
+ }
+
+ /**
+ * @return Returns the filename.
+ */
+ public String getFilename ()
+ {
+ return filename;
+ }
+
+ /**
+ * @param filename
+ * The filename to set.
+ */
+ public void setFilename (String filename)
+ {
+ this.filename = filename;
+ }
+
+ /**
+ * Overridden to append a dot and the filename to the id built by IndexData.
+ * @see IndexData#getId()
+ */
+ public String getId ()
+ {
+ return super.getId () + "." + filename;
+ }
+
+ /**
+ * @return the text produced by
+ * {@link IndexData#getFullText(XWikiDocument, XWikiContext)}, followed by
+ * a blank and the extracted text of this attachment when extraction
+ * yielded anything.
+ */
+ public String getFullText (XWikiDocument doc, XWikiContext context)
+ {
+ final StringBuffer text = new StringBuffer (super.getFullText (doc, context));
+ final String extracted = getContentAsText (doc, context);
+ if (extracted != null)
+ {
+ text.append (" ");
+ text.append (extracted);
+ }
+ return text.toString ();
+ }
+
+ /**
+ * @param doc
+ * @param context
+ * @param contentText
+ * @return
+ */
+ private String getContentAsText (XWikiDocument doc, XWikiContext context)
+ {
+ String contentText = null;
+ try
+ {
+ XWikiAttachment att = doc.getAttachment (filename);
+ if (LOG.isDebugEnabled ()) LOG.debug ("have attachment for filename " + filename + ": " + att);
+ byte[] content = att.getContent (context);
+ if (filename != null)
+ {
+ String[] nameParts = filename.split ("\\.");
+ if (nameParts.length > 1)
+ {
+ contentText = TextExtractor.getText (content, (String) MIMETYPES
+ .get (nameParts[nameParts.length - 1].toLowerCase ()));
+ }
+ }
+ } catch (Exception e)
+ {
+ LOG.error ("error getting content of attachment", e);
+ e.printStackTrace ();
+ }
+ return contentText;
+ }
+
+}
Property changes on: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/AttachmentData.java
___________________________________________________________________
Name: svn:eol-style
+ native
Added: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/DocumentData.java
===================================================================
--- xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/DocumentData.java 2006-12-06 21:50:31 UTC (rev 1689)
+++ xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/DocumentData.java 2006-12-06 22:29:02 UTC (rev 1690)
@@ -0,0 +1,70 @@
+/*
+ *
+ * ===================================================================
+ *
+ * Copyright (c) 2005 Jens Krämer, All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details, published at
+ * http://www.gnu.org/copyleft/gpl.html or in gpl.txt in the
+ * root folder of this distribution.
+ *
+ * Created on 25.01.2005
+ *
+ */
+
+package com.xpn.xwiki.plugin.lucene;
+
+import org.apache.log4j.Logger;
+
+import com.xpn.xwiki.XWikiContext;
+import com.xpn.xwiki.XWikiException;
+import com.xpn.xwiki.doc.XWikiDocument;
+
+/**
+ * Holds all data but the content of a wiki page to be indexed. The content is
+ * retrieved at indexing time, which should save us some memory especially when
+ * rebuilding an index for a big wiki.
+ * @author <a href="mailto:jk at jkraemer.net">Jens Krämer </a>
+ */
+public class DocumentData extends IndexData
+{
+ private static final Logger LOG = Logger.getLogger (DocumentData.class);
+
+ public DocumentData (final XWikiDocument doc, final XWikiContext context)
+ {
+ super (doc, context);
+ setAuthor (doc.getAuthor ());
+ setCreator (doc.getCreator ());
+ setModificationDate (doc.getDate ());
+ setCreationDate (doc.getCreationDate ());
+ }
+
+ /**
+ * @see net.jkraemer.xwiki.plugins.lucene.IndexData#getType()
+ */
+ public String getType ()
+ {
+ return LucenePlugin.DOCTYPE_WIKIPAGE;
+ }
+
+ /**
+ * @return the text produced by
+ * {@link IndexData#getFullText(XWikiDocument, XWikiContext)}, a blank,
+ * and then the content of the given document
+ */
+ public String getFullText (XWikiDocument doc, XWikiContext context)
+ {
+ final StringBuffer text = new StringBuffer (super.getFullText (doc, context));
+ text.append (" ").append (doc.getContent ());
+ return text.toString ();
+ }
+
+}
Property changes on: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/DocumentData.java
___________________________________________________________________
Name: svn:eol-style
+ native
Added: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/IndexData.java
===================================================================
--- xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/IndexData.java 2006-12-06 21:50:31 UTC (rev 1689)
+++ xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/IndexData.java 2006-12-06 22:29:02 UTC (rev 1690)
@@ -0,0 +1,273 @@
+/*
+ *
+ * ===================================================================
+ *
+ * Copyright (c) 2005 Jens Krämer, All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details, published at
+ * http://www.gnu.org/copyleft/gpl.html or in gpl.txt in the
+ * root folder of this distribution.
+ *
+ * Created on 25.01.2005
+ *
+ */
+
+package com.xpn.xwiki.plugin.lucene;
+
+import java.util.Date;
+
+import org.apache.commons.lang.time.FastDateFormat;
+import org.apache.log4j.Logger;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+
+import com.xpn.xwiki.XWikiContext;
+import com.xpn.xwiki.doc.XWikiDocument;
+
+/**
+ * @author <a href="mailto:jk at jkraemer.net">Jens Krämer </a>
+ */
+public abstract class IndexData
+{
+ private static final Logger LOG = Logger.getLogger (IndexData.class);
+
+ private String documentName;
+ private String documentWeb;
+ private String fullName;
+ private String author;
+ private Date creationDate;
+ private String creator;
+ private String language;
+ private Date modificationDate;
+ /** name of the virtual wiki this doc belongs to */
+ private String wiki;
+
+ public IndexData (final XWikiDocument doc, final XWikiContext context)
+ {
+ setDocumentName (doc.getName ());
+ setDocumentWeb (doc.getWeb ());
+ setWiki (context.getDatabase ());
+ setFullName (new StringBuffer (wiki).append (":").append (documentWeb).append (".")
+ .append (documentName).toString ());
+ setLanguage (doc.getLanguage ());
+ }
+
+ /**
+ * Adds this document's data to a lucene Document instance for indexing.
+ * <p>
+ * <strong>Short introduction to Lucene field types </strong>
+ * </p>
+ * <p>
+ * Which type of Lucene field is used determines what Lucene does with data
+ * and how we can use it for searching and showing search results:
+ * </p>
+ * <ul>
+ * <li>Keyword fields don't get tokenized, but are searchable and stored in
+ * the index. This is perfect for fields you want to search in
+ * programmatically (like ids and such), and date fields. Since all
+ * user-entered queries are tokenized, letting the user search these fields
+ * makes almost no sense, except for queries for date fields, where
+ * tokenization is useless.</li>
+ * <li>the stored text fields are used for short texts which should be
+ * searchable by the user, and stored in the index for reconstruction.
+ * Perfect for document names, titles, abstracts.</li>
+ * <li>the unstored field takes the biggest part of the content - the full
+ * text. It is tokenized and indexed, but not stored in the index. This
+ * makes sense, since when the user wants to see the full content, he clicks
+ * the link to view the full version of a document, which is then delivered
+ * by xwiki.</li>
+ * </ul>
+ * @param luceneDoc
+ * the Lucene document that receives the fields
+ * @param doc wiki document handed on to getFullText
+ * @param context context handed on to getFullText
+ */
+ public void addDataToLuceneDocument (org.apache.lucene.document.Document luceneDoc, XWikiDocument doc,
+ XWikiContext context)
+ {
+ // Keyword-like fields. The id stays untokenized so that the TermQuery from buildQuery () can match it
+ luceneDoc.add (new Field(IndexFields.DOCUMENT_ID, getId(), Field.Store.YES, Field.Index.UN_TOKENIZED));
+ luceneDoc.add (new Field(IndexFields.DOCUMENT_LANGUAGE, this.language, Field.Store.YES, Field.Index.TOKENIZED));
+ if (wiki != null && wiki.length () > 0)
+ luceneDoc.add (new Field (IndexFields.DOCUMENT_WIKI, wiki, Field.Store.YES, Field.Index.TOKENIZED));
+ if (getType () != null) luceneDoc.add (new Field (IndexFields.DOCUMENT_TYPE, getType (), Field.Store.YES, Field.Index.TOKENIZED));
+ if (modificationDate != null)
+ luceneDoc.add (new Field(IndexFields.DOCUMENT_DATE, IndexFields
+ .dateToString (modificationDate), Field.Store.YES, Field.Index.NO));
+ if (creationDate != null)
+ luceneDoc.add (new Field(IndexFields.DOCUMENT_CREATIONDATE, IndexFields
+ .dateToString (creationDate), Field.Store.YES, Field.Index.NO));
+
+ // stored Text fields: tokenized and indexed
+ luceneDoc.add (new Field(IndexFields.DOCUMENT_NAME, documentName, Field.Store.YES, Field.Index.TOKENIZED));
+ luceneDoc.add (new Field(IndexFields.DOCUMENT_WEB, documentWeb, Field.Store.YES, Field.Index.TOKENIZED));
+ if (author != null) luceneDoc.add (new Field(IndexFields.DOCUMENT_AUTHOR, author, Field.Store.YES, Field.Index.TOKENIZED));
+ if (creator != null) luceneDoc.add (new Field(IndexFields.DOCUMENT_CREATOR, creator, Field.Store.YES, Field.Index.TOKENIZED));
+
+ // UnStored fields: tokenized and indexed, but no reconstruction of
+ // original content will be possible from the search result
+ try
+ {
+ final String ft = getFullText (doc, context);
+ if (ft != null) luceneDoc.add (new Field(IndexFields.FULLTEXT, ft, Field.Store.NO, Field.Index.TOKENIZED));
+ } catch (Exception e)
+ {
+ LOG.error ("error extracting fulltext for document " + this, e);
+ }
+ }
+
+ /**
+ * Builds a Lucene query matching only the document this instance
+ * represents. This is used for removing old versions of a document from the
+ * index before adding a new one.
+ * @return a query matching the field DOCUMENT_ID to the value of #getId()
+ */
+ public Query buildQuery ()
+ {
+ return new TermQuery (new Term (IndexFields.DOCUMENT_ID, getId ()));
+ }
+
+ /**
+ * Builds the id as [wiki:]web.name.language; the wiki prefix is left out
+ * when no wiki name is set.
+ * @return string unique to this document across all languages and virtual
+ * wikis
+ */
+ public String getId ()
+ {
+ final boolean hasWiki = wiki != null && wiki.length () > 0;
+ final StringBuffer id = new StringBuffer (hasWiki ? wiki + ":" : "");
+ id.append (documentWeb).append (".").append (documentName).append (".").append (language);
+ return id.toString ();
+ }
+
+ /** @return documentName and documentWeb plus, when set, author and creator, separated by blanks */
+ public String getFullText (XWikiDocument doc, XWikiContext context)
+ {
+ // unset values are skipped (append (null) would index the word "null"); every part gets its own blank
+ StringBuffer sb = new StringBuffer (documentName).append (" ").append (documentWeb);
+ if (author != null) sb.append (" ").append (author);
+ if (creator != null) sb.append (" ").append (creator);
+ return sb.toString ();
+ }
+
+ public abstract String getType ();
+
+ public String toString ()
+ {
+ return getId ();
+ }
+
+ /**
+ * @param author
+ * The author to set.
+ */
+ public void setAuthor (String author)
+ {
+ this.author = author;
+ }
+
+ /**
+ * @param documentName
+ * The documentName to set.
+ */
+ public void setDocumentName (String documentName)
+ {
+ this.documentName = documentName;
+ }
+
+ /**
+ * @param documentWeb
+ * The documentWeb to set.
+ */
+ public void setDocumentWeb (String documentWeb)
+ {
+ this.documentWeb = documentWeb;
+ }
+
+ /**
+ * @param modificationDate
+ * The modificationDate to set.
+ */
+ public void setModificationDate (Date modificationDate)
+ {
+ this.modificationDate = modificationDate;
+ }
+
+ public String getDocumentName ()
+ {
+ return documentName;
+ }
+
+ public String getDocumentWeb ()
+ {
+ return documentWeb;
+ }
+
+ public String getWiki ()
+ {
+ return wiki;
+ }
+
+ public void setWiki (String wiki)
+ {
+ this.wiki = wiki;
+ }
+
+ public Date getCreationDate ()
+ {
+ return creationDate;
+ }
+
+ public void setCreationDate (Date creationDate)
+ {
+ this.creationDate = creationDate;
+ }
+
+ public String getCreator ()
+ {
+ return creator;
+ }
+
+ public void setCreator (String creator)
+ {
+ this.creator = creator;
+ }
+
+ /**
+ * @return
+ */
+ public String getFullName ()
+ {
+ return fullName;
+ }
+
+ public void setFullName (String fullName)
+ {
+ this.fullName = fullName;
+ }
+
+ public String getLanguage ()
+ {
+ return language;
+ }
+
+ public void setLanguage (String lang)
+ {
+ if (lang != null && lang.length () > 0)
+ this.language = lang;
+ else
+ this.language = "default";
+ }
+}
Property changes on: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/IndexData.java
___________________________________________________________________
Name: svn:eol-style
+ native
Added: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/IndexFields.java
===================================================================
--- xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/IndexFields.java 2006-12-06 21:50:31 UTC (rev 1689)
+++ xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/IndexFields.java 2006-12-06 22:29:02 UTC (rev 1690)
@@ -0,0 +1,112 @@
+/*
+ *
+ * ===================================================================
+ *
+ * Copyright (c) 2005 Jens Krämer, All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details, published at
+ * http://www.gnu.org/copyleft/gpl.html or in gpl.txt in the
+ * root folder of this distribution.
+ *
+ * Created on 28.01.2005
+ *
+ */
+package com.xpn.xwiki.plugin.lucene;
+
+import java.text.SimpleDateFormat;
+import java.util.Date;
+
+import org.apache.commons.lang.time.FastDateFormat;
+import org.apache.log4j.Logger;
+
+/**
+ * Contains constants naming the Lucene index fields used by this Plugin and
+ * some helper methods for proper handling of special field values like dates.
+ * @author <a href="mailto:jk at jkraemer.net">Jens Krämer </a>
+ */
+public abstract class IndexFields
+{
+
+ /**
+ * Keyword field, holds a string uniquely identifying a document across the
+ * index. this is used for finding old versions of a document to be indexed.
+ */
+ public static final String DOCUMENT_ID = "_docid";
+ /** Keyword field, holds the name of the virtual wiki a document belongs to */
+ public static final String DOCUMENT_WIKI = "wiki";
+ /** Name of the document */
+ public static final String DOCUMENT_NAME = "name";
+ /** Name of the web the document belongs to */
+ public static final String DOCUMENT_WEB = "web";
+ /** Language of the document */
+ public static final String DOCUMENT_LANGUAGE = "lang";
+ /**
+ * Type of a document, "attachment" or "wikipage", used to control
+ * presentation of searchresults. See {@link SearchResult}and
+ * xdocs/searchResult.vm.
+ */
+ public static final String DOCUMENT_TYPE = "type";
+
+ /** Filename, only used for attachments */
+ public static final String FILENAME = "filename";
+ /** Last modifier */
+ public static final String DOCUMENT_AUTHOR = "author";
+ /** Creator of the document */
+ public static final String DOCUMENT_CREATOR = "creator";
+ /** Date of last modification */
+ public static final String DOCUMENT_DATE = "date";
+ /** Date of creation */
+ public static final String DOCUMENT_CREATIONDATE = "creationdate";
+ /**
+ * Fulltext content, not stored (and can therefore not be restored from the
+ * index).
+ */
+ public static final String FULLTEXT = "ft";
+ /** not in use */
+ public static final String KEYWORDS = "kw";
+ /**
+ * Format for date storage in the index, and therefore the format which has
+ * to be used for date-queries.
+ */
+ public static final String DATE_FORMAT = "yyyyMMddHHmm";
+
+ private static final FastDateFormat df = FastDateFormat
+ .getInstance (IndexFields.DATE_FORMAT);
+ private static final Logger LOG = Logger.getLogger (IndexFields.class);
+
+ public static final String dateToString (Date date)
+ {
+ return df.format (date);
+ }
+
+ /** Parses a DATE_FORMAT string; returns null, after logging the cause, when the value cannot be parsed. */
+ public static final Date stringToDate (String dateValue)
+ {
+ try
+ {
+ return new SimpleDateFormat (DATE_FORMAT).parse (dateValue);
+ } catch (Exception e)
+ {
+ LOG.warn ("cannot parse index date value '" + dateValue + "'", e);
+ return null;
+ }
+ }
+
+ /**
+ * Private: this class only offers constants and static helper methods.
+ */
+ private IndexFields ()
+ {
+ super ();
+ // nothing to initialise
+ }
+
+}
Property changes on: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/IndexFields.java
___________________________________________________________________
Name: svn:eol-style
+ native
Added: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/IndexRebuilder.java
===================================================================
--- xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/IndexRebuilder.java 2006-12-06 21:50:31 UTC (rev 1689)
+++ xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/IndexRebuilder.java 2006-12-06 22:29:02 UTC (rev 1690)
@@ -0,0 +1,231 @@
+/*
+ *
+ * ===================================================================
+ *
+ * Copyright (c) 2005 Jens Krämer, All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details, published at
+ * http://www.gnu.org/copyleft/gpl.html or in gpl.txt in the
+ * root folder of this distribution.
+ *
+ * Created on 01.02.2005
+ *
+ */
+package com.xpn.xwiki.plugin.lucene;
+
+import java.util.*;
+
+import org.apache.log4j.Logger;
+
+import com.xpn.xwiki.XWikiContext;
+import com.xpn.xwiki.XWikiException;
+import com.xpn.xwiki.objects.BaseObject;
+import com.xpn.xwiki.api.XWiki;
+import com.xpn.xwiki.doc.XWikiAttachment;
+import com.xpn.xwiki.doc.XWikiDocument;
+
+/**
+ * Handles rebuilding of the whole Index. This involves the following steps:
+ * <ul>
+ * <li>empty the existing index</li>
+ * <li>retrieve the names of all virtual wikis</li>
+ * <li>get and index all documents for each virtual wiki</li>
+ * <li>get and index all translations of each document</li>
+ * <li>get and index all attachments of each document</li>
+ * </ul>
+ * The indexing of all content fetched from the wiki is triggered by handing the
+ * data to the indexUpdater thread.
+ *
+ * @author <a href="mailto:jk at jkraemer.net">Jens Krämer </a>
+ */
+public class IndexRebuilder {
+ private IndexUpdater indexUpdater;
+ private static final Logger LOG = Logger.getLogger(IndexRebuilder.class);
+
+ /**
+ * First empties the index, then fetches all Documents, their translations
+ * and their attachments for re-addition to the index.
+ *
+ * @param wiki api object used to detect and enumerate virtual wikis
+ * @param context supplies the XWiki instance and the current database name
+ * @return total number of documents and attachments successfully added to
+ * the indexer queue, -1 when reading any of the wikis failed.
+ * @todo TODO: give more detailed results
+ */
+ public int rebuildIndex(com.xpn.xwiki.api.XWiki wiki, XWikiContext context) {
+ indexUpdater.cleanIndex();
+ int retval = 0;
+ Collection wikiServers;
+ com.xpn.xwiki.XWiki xwiki = context.getWiki();
+ if (wiki.isVirtual()) {
+ wikiServers = findWikiServers(wiki, context);
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("found " + wikiServers.size() + " virtual wikis:");
+ for (Iterator iter = wikiServers.iterator(); iter.hasNext();) {
+ LOG.debug(iter.next());
+ }
+ }
+ } else {
+ // no virtual wiki configuration, just index the wiki the context
+ // belongs to
+ wikiServers = new ArrayList();
+ ((ArrayList) wikiServers).add(context.getDatabase());
+ }
+ // Iterate all found virtual wikis
+ for (Iterator iter = wikiServers.iterator(); iter.hasNext();) {
+ int wikiResult = indexWiki(xwiki, (String) iter.next());
+ // a failed wiki (-1) makes the overall result -1; the remaining wikis are still queued
+ retval = (wikiResult == -1 || retval == -1) ? -1 : retval + wikiResult;
+ }
+ return retval;
+ }
+
+ /**
+ * Adds the content of a given wiki to the indexUpdater's queue.
+ *
+ * @param xwiki wiki instance used to list and load the documents
+ * @param wikiName database name of the wiki to read
+ * @return sum of the counts for documents, translations, attachments and
+ * objects, or -1 when the document names could not be retrieved
+ */
+ protected int indexWiki(com.xpn.xwiki.XWiki xwiki, String wikiName) {
+ LOG.info("reading content of wiki " + wikiName);
+ int retval = 0;
+ XWikiContext wikiContext = new XWikiContext();
+ wikiContext.setWiki(xwiki);
+ wikiContext.setDatabase(wikiName);
+ Collection docNames = null;
+ try {
+ docNames = xwiki.getStore().searchDocumentsNames("", wikiContext);
+ } catch (XWikiException e1) {
+ // hand the exception to the logger so the cause ends up in the log
+ LOG.error("error getting document names for wiki " + wikiName, e1);
+ return -1;
+ }
+ for (Iterator iterator = docNames.iterator(); iterator.hasNext();) {
+ String docName = (String) iterator.next();
+ XWikiDocument document;
+ try {
+ document = xwiki.getDocument(docName, wikiContext);
+ } catch (XWikiException e2) {
+ // skip this document only; the rest of the wiki is still indexed
+ LOG.error("error fetching document " + wikiName + ":" + docName, e2);
+ continue;
+ }
+ if (document != null) {
+ indexUpdater.add(document, wikiContext);
+ retval++;
+ retval += addTranslationsOfDocument(document, wikiContext);
+ retval += addAttachmentsOfDocument(document, wikiContext);
+ retval += addObjectsOfDocument(document, wikiContext);
+ } else {
+ LOG.info("XWiki delivered null for document name " + wikiName + ":" + docName);
+ }
+ }
+ return retval;
+ }
+
+ /**
+ * Getting the content(values of title/category/content/extract properties ) from the XWiki.ArticleClass objects
+ * @param document
+ * @param wikiContext
+ */
+ private int addObjectsOfDocument(XWikiDocument document, XWikiContext wikiContext) {
+ int retval = 0;
+ Map xwikiObjects = document.getxWikiObjects();
+ if (document.hasElement(XWikiDocument.HAS_OBJECTS)) {
+ retval += xwikiObjects.size();
+ indexUpdater.addObject(document, wikiContext);
+ }
+ return retval;
+ }
+
+ /**
+ * Queues the document's attachments for indexing.
+ * @return number of attachments that reached the queue without an error
+ */
+ private int addAttachmentsOfDocument(XWikiDocument document, XWikiContext wikiContext) {
+ int retval = 0;
+ final List attachmentList = document.getAttachmentList();
+ for (Iterator attachmentIter = attachmentList.iterator(); attachmentIter.hasNext();) {
+ try {
+ XWikiAttachment attachment = (XWikiAttachment) attachmentIter.next();
+ indexUpdater.add(document, attachment, wikiContext);
+ retval++; // count only what actually reached the queue
+ } catch (Exception e) {
+ LOG.error("error retrieving attachment of document " + document.getFullName(), e);
+ }
+ }
+ return retval;
+ }
+
+ /**
+ * Queues every translation of the document for indexing.
+ * @param document document whose translations are queued
+ * @param wikiContext context used to load the translations
+ * @return number of translations queued; 0 when the list of translations
+ * could not be read
+ */
+ protected int addTranslationsOfDocument(XWikiDocument document, XWikiContext wikiContext) {
+ int retval = 0;
+ List translations;
+ try {
+ translations = document.getTranslationList(wikiContext);
+ } catch (XWikiException e) {
+ LOG.error("error getting list of translations from document " + document.getFullName(), e);
+ return 0;
+ }
+ for (Iterator iter = translations.iterator(); iter.hasNext();) {
+ String lang = (String) iter.next();
+ try {
+ indexUpdater.add(document.getTranslatedDocument(lang, wikiContext), wikiContext);
+ retval++;
+ } catch (XWikiException e1) {
+ LOG.error("error getting translated document for document " + document.getFullName()
+ + " and language " + lang, e1);
+ }
+ }
+ return retval;
+ }
+
+ /**
+ * Lists the virtual wikis by searching for documents that carry an XWiki.XWikiServerClass object.
+ * @return lower-cased wiki names (document name minus the "XWiki.XWikiServer" prefix); empty on error
+ */
+ private Collection findWikiServers(XWiki wiki, XWikiContext context) {
+ List retval = new ArrayList();
+ final String hql = ", BaseObject as obj, StringProperty as prop "
+ + "where doc.fullName=obj.name and obj.className='XWiki.XWikiServerClass'"
+ + " and prop.id.id = obj.id " + "and prop.id.name = 'server'";
+ List result = null;
+ try {
+ result = wiki.getXWiki().getStore().searchDocumentsNames(hql, context);
+ } catch (Exception e) {
+ LOG.error("error getting list of wiki servers!", e);
+ }
+ if (result != null) {
+ for (Iterator iter = result.iterator(); iter.hasNext();) {
+ String docname = (String) iter.next();
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("possible server name: " + docname);
+ }
+ if (docname.startsWith("XWiki.XWikiServer")) {
+ retval.add(docname.substring("XWiki.XWikiServer".length()).toLowerCase());
+ }
+ }
+ }
+ return retval;
+ }
+
+ public void setIndexUpdater(IndexUpdater indexUpdater) {
+ this.indexUpdater = indexUpdater;
+ }
+}
Property changes on: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/IndexRebuilder.java
___________________________________________________________________
Name: svn:eol-style
+ native
Added: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/IndexUpdater.java
===================================================================
--- xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/IndexUpdater.java 2006-12-06 21:50:31 UTC (rev 1689)
+++ xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/IndexUpdater.java 2006-12-06 22:29:02 UTC (rev 1690)
@@ -0,0 +1,431 @@
+/*
+ *
+ * ===================================================================
+ *
+ * Copyright (c) 2005 Jens Krämer, All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details, published at
+ * http://www.gnu.org/copyleft/gpl.html or in gpl.txt in the
+ * root folder of this distribution.
+ *
+ * Created on 21.01.2005
+ *
+ */
+
+package com.xpn.xwiki.plugin.lucene;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.*;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.log4j.Logger;
+import org.apache.log4j.MDC;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.document.Field;
+
+import com.xpn.xwiki.XWiki;
+import com.xpn.xwiki.XWikiContext;
+import com.xpn.xwiki.objects.BaseObject;
+import com.xpn.xwiki.doc.XWikiAttachment;
+import com.xpn.xwiki.doc.XWikiDocument;
+import com.xpn.xwiki.notify.XWikiActionNotificationInterface;
+import com.xpn.xwiki.notify.XWikiDocChangeNotificationInterface;
+import com.xpn.xwiki.notify.XWikiNotificationRule;
+
+/**
+ * @author <a href="mailto:jk at jkraemer.net">Jens Krämer </a>
+ */
+public class IndexUpdater implements Runnable, XWikiDocChangeNotificationInterface,
+ XWikiActionNotificationInterface {
+
+ private static final Logger LOG = Logger.getLogger(IndexUpdater.class);
+
+ /**
+  * Milliseconds of sleep between checks for changed documents
+  */
+ private int indexingInterval = 300000;
+ /**
+  * Stop flag polled by {@link #run()}. Volatile because {@link #doExit()}
+  * sets it from a thread other than the one running the loop.
+  */
+ private volatile boolean exit = false;
+ /**
+  * Currently open index writer, or null. Volatile because
+  * {@link #cleanIndex()} polls it without holding a lock while the
+  * indexing loop opens and closes it.
+  */
+ private volatile IndexWriter writer;
+ private String indexDir;
+ private XWikiDocumentQueue queue = new XWikiDocumentQueue();
+ private Analyzer analyzer;
+ private LucenePlugin plugin;
+ private IndexSearcher searcher;
+ private IndexReader reader;
+
+ private XWikiContext context;
+ private XWiki xwiki;
+
+ /** Names of all Lucene fields seen while indexing; filled by addToIndex. */
+ static List fields = new ArrayList();
+
+
+ /**
+ * Sets the exit flag that the loop in {@link #run()} checks at the start
+ * of every pass.
+ */
+ public void doExit() {
+ exit = true;
+ }
+
+ /**
+ * Main loop. Polls the queue for documents to be indexed.
+ *
+ * @see java.lang.Runnable#run()
+ */
+ public void run() {
+ MDC.put("url", "index updating thread");
+
+ while (!exit) {
+ if (queue.isEmpty()) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("IndexUpdater: queue empty, nothing to do");
+ }
+ } else {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("IndexUpdater: documents in queue, start indexing");
+ }
+ // we need a searcher to find old versions of documents
+ openSearcher();
+ openWriter(false);
+ List oldDocs = new ArrayList();
+
+ while (!queue.isEmpty()) {
+ IndexData data = queue.remove();
+
+ try {
+ oldDocs.addAll(getOldIndexDocIds(data));
+ XWikiDocument doc = xwiki.getDocument(data.getFullName(), context);
+ addToIndex(data, doc, context);
+ } catch (Exception e) {
+ LOG.error("error retrieving doc from own context: " + e.getMessage(), e);
+ e.printStackTrace();
+ }
+ }
+ closeWriter();
+ // the following searcher close/open cycle is necessary because
+ // the old reader is not valid for document deletion anymore
+ // after
+ // updating the index
+ closeSearcher();
+ openSearcher();
+ deleteOldDocs(oldDocs);
+ closeSearcher();
+ // readers and searchers should be reopened after index update
+ plugin.openSearchers();
+ }
+ try {
+ Thread.sleep(indexingInterval);
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ xwiki.getStore().cleanUp(context);
+ MDC.remove("url");
+ }
+
+ /**
+  * Closes the searcher and the reader used for finding and deleting old
+  * index entries and resets both fields to null. The two are closed
+  * independently so that a failure closing the searcher does not leave
+  * the reader open.
+  */
+ private synchronized void closeSearcher() {
+     try {
+         if (searcher != null) {
+             searcher.close();
+         }
+     } catch (IOException e) {
+         LOG.error("error closing index searcher", e);
+     } finally {
+         searcher = null;
+     }
+     try {
+         if (reader != null) {
+             reader.close();
+         }
+     } catch (IOException e) {
+         LOG.error("error closing index reader", e);
+     } finally {
+         reader = null;
+     }
+ }
+
+ /**
+  * Opens the index reader and searcher used for finding and deleting old
+  * versions of indexed documents.
+  * <p>
+  * Any reader/searcher still held is closed first. init() opens a pair
+  * and run() opens one again before closing, so the earlier pair would
+  * otherwise be overwritten without being closed. On failure the error
+  * is logged.
+  * </p>
+  */
+ private synchronized void openSearcher() {
+     if (searcher != null || reader != null) {
+         closeSearcher();
+     }
+     try {
+         reader = IndexReader.open(indexDir);
+         searcher = new IndexSearcher(reader);
+     } catch (IOException e) {
+         LOG.error("error opening index searcher", e);
+     }
+ }
+
+ /**
+  * Deletes the documents with the given ids from the index.
+  * <p>
+  * Does nothing but log when no reader is open (openSearcher logs and
+  * leaves the reader unset on failure); without this check the resulting
+  * NullPointerException would escape run() and end the indexing thread.
+  * </p>
+  *
+  * @param oldDocs list of Integer document ids to delete
+  */
+ private void deleteOldDocs(List oldDocs) {
+     if (reader == null) {
+         LOG.error("no index reader available, cannot delete " + oldDocs.size() + " old docs");
+         return;
+     }
+     for (Iterator iter = oldDocs.iterator(); iter.hasNext();) {
+         Integer id = (Integer) iter.next();
+         if (LOG.isDebugEnabled()) {
+             LOG.debug("delete doc " + id);
+         }
+         try {
+             reader.deleteDocument(id.intValue());
+         } catch (IOException e1) {
+             LOG.error("error deleting doc " + id, e1);
+         }
+     }
+ }
+
+ /**
+ * @param data
+ * @return
+ */
+ private Collection getOldIndexDocIds(IndexData data) {
+ List retval = new ArrayList(3);
+ Query query = data.buildQuery();
+ try {
+ Hits hits = searcher.search(query);
+ for (int i = 0; i < hits.length(); i++) {
+ retval.add(new Integer(hits.id(i)));
+ }
+ } catch (IOException e) {
+ LOG.error("error looking for old versions of document " + data + " with query " + query, e);
+ e.printStackTrace();
+ }
+ return retval;
+ }
+
+ /**
+  * Opens the index writer on the index directory unless one is already
+  * open, in which case only an error is logged.
+  *
+  * @param create passed through to the IndexWriter constructor
+  */
+ private void openWriter(boolean create) {
+     if (writer == null) {
+         try {
+             // fix for windows by Daniel Cortes: go through FSDirectory
+             FSDirectory directory = FSDirectory.getDirectory(indexDir, false);
+             writer = new IndexWriter(directory, analyzer, create);
+             writer.setUseCompoundFile(true);
+             if (LOG.isDebugEnabled()) {
+                 LOG.debug("successfully opened index writer : " + indexDir);
+             }
+         } catch (IOException e) {
+             LOG.error("IOException when opening Lucene Index for writing at " + indexDir, e);
+         }
+     } else {
+         LOG.error("Writer already open and createWriter called");
+     }
+ }
+
+ /**
+  * Optimizes and closes the open index writer and resets the field to
+  * null. Failures of either step are logged; the field is cleared in any
+  * case. Logs an error and returns when no writer is open.
+  */
+ private void closeWriter() {
+     if (writer != null) {
+         try {
+             writer.optimize();
+         } catch (IOException optimizeFailure) {
+             LOG.error("Exception caught when optimizing Index", optimizeFailure);
+         }
+         try {
+             writer.close();
+         } catch (Exception closeFailure) {
+             LOG.error("Exception caught when closing IndexWriter", closeFailure);
+         }
+         writer = null;
+         if (LOG.isDebugEnabled()) {
+             LOG.debug("closed writer.");
+         }
+     } else {
+         LOG.error("Writer not open and closeWriter called");
+     }
+ }
+
+ /**
+  * Builds a Lucene document for the given item and adds it through the
+  * open writer. The names of all fields of the new document are
+  * remembered in the static field list.
+  *
+  * @param data queued item that fills the Lucene document
+  * @param doc wiki document passed on to the item
+  * @param context context passed on to the item
+  * @throws IOException declared for failures while writing to the index
+  */
+ private void addToIndex(IndexData data, XWikiDocument doc, XWikiContext context) throws IOException {
+     if (LOG.isDebugEnabled()) {
+         LOG.debug("addToIndex: " + data);
+     }
+     org.apache.lucene.document.Document luceneDoc = new org.apache.lucene.document.Document();
+     data.addDataToLuceneDocument(luceneDoc, doc, context);
+     // collecting all the fields for using up in search
+     Enumeration allFields = luceneDoc.fields();
+     while (allFields.hasMoreElements()) {
+         String fieldName = ((Field) allFields.nextElement()).name();
+         if (!fields.contains(fieldName)) {
+             fields.add(fieldName);
+         }
+     }
+     writer.addDocument(luceneDoc);
+ }
+
+ /**
+ * @param indexDir path of the index directory used when opening the reader and the writer.
+ */
+ public void setIndexDir(String indexDir) {
+ this.indexDir = indexDir;
+ }
+
+ /**
+ * @param analyzer the analyzer handed to the index writer when it is opened.
+ */
+ public void setAnalyzer(Analyzer analyzer) {
+ this.analyzer = analyzer;
+ }
+
+ /**
+  * Prepares the updater: creates its own context on the given wiki,
+  * picks the index directory, reads the indexing interval and opens the
+  * searcher.
+  *
+  * @param config configuration holding the index directory list
+  *        ({@link LucenePlugin#PROP_INDEX_DIR}) and the indexing interval
+  *        in seconds ({@link LucenePlugin#PROP_INDEXING_INTERVAL},
+  *        default 300)
+  * @param plugin plugin whose searchers are reopened after index updates
+  * @param wiki wiki used to load the documents to index
+  */
+ public synchronized void init(Properties config, LucenePlugin plugin, XWiki wiki) {
+     this.xwiki = wiki;
+     this.context = new XWikiContext();
+     this.context.setWiki(xwiki);
+     this.context.setDatabase(xwiki.getDatabase());
+     this.plugin = plugin;
+     // take the first configured index dir as the one for writing
+     String[] indexDirs = StringUtils.split(config.getProperty(LucenePlugin.PROP_INDEX_DIR), " ,");
+     if (indexDirs != null && indexDirs.length > 0) {
+         this.indexDir = indexDirs[0];
+         File f = new File(indexDir);
+         if (!f.isDirectory()) {
+             f.mkdirs();
+             cleanIndex();
+         }
+     }
+     // a malformed or negative interval must not abort plugin start-up or
+     // make Thread.sleep fail later; fall back to the default instead
+     final int defaultSeconds = 300;
+     int seconds = defaultSeconds;
+     String configured = config.getProperty(LucenePlugin.PROP_INDEXING_INTERVAL, "300");
+     try {
+         seconds = Integer.parseInt(configured.trim());
+         if (seconds < 0) {
+             LOG.warn("negative indexing interval '" + configured + "', using " + defaultSeconds + "s");
+             seconds = defaultSeconds;
+         }
+     } catch (NumberFormatException e) {
+         LOG.warn("invalid indexing interval '" + configured + "', using " + defaultSeconds + "s", e);
+     }
+     indexingInterval = 1000 * seconds;
+     openSearcher();
+ }
+
+ /**
+ * Replaces the index with an empty one by opening the writer in create
+ * mode and closing it again. Waits in one-second steps for as long as
+ * another writer is open.
+ */
+ public void cleanIndex() {
+ LOG.info("trying to clear index for rebuilding");
+ while (writer != null) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("waiting for existing index writer to close");
+ }
+ try {
+ Thread.sleep(1000);
+ } catch (InterruptedException e) {
+ // NOTE(review): the interrupt is only printed and the wait loop continues
+ e.printStackTrace();
+ }
+ }
+ synchronized (this) {
+ openWriter(true);
+ closeWriter();
+ }
+ }
+
+ /**
+ * Queues the given document for indexing and, if it reports having
+ * objects, queues its objects as well.
+ *
+ * @param document document to queue
+ * @param context context stored with the queued items
+ */
+ public void add(XWikiDocument document, XWikiContext context) {
+ queue.add(new DocumentData(document, context));
+ if (document.hasElement(XWikiDocument.HAS_OBJECTS)) {
+ addObject(document, context);
+ }
+ }
+
+ /**
+ * Queues an {@link ObjectData} item for the given document.
+ *
+ * @param document document whose objects are to be indexed
+ * @param context context stored with the queued item
+ */
+ public void addObject(XWikiDocument document, XWikiContext context) {
+ queue.add(new ObjectData(document, context));
+ }
+
+ /**
+ * @param attachment
+ */
+ public void add(XWikiDocument document, XWikiAttachment attachment, XWikiContext context) {
+ if (document != null && attachment != null && context != null)
+ queue.add(new AttachmentData(document, attachment, context));
+ else
+ LOG.error("invalid parameters given to add: " + document + ", " + attachment + ", " + context);
+ }
+
+
+ /**
+  * Queues every attachment of the given document for indexing.
+  *
+  * @param document document whose attachments are queued
+  * @param context context stored with the queued items
+  * @return size of the document's attachment list, regardless of whether
+  *         queueing an individual attachment failed
+  */
+ public int addAttachmentsOfDocument(XWikiDocument document, XWikiContext context) {
+     final List attachmentList = document.getAttachmentList();
+     final int count = attachmentList.size();
+     final Iterator remaining = attachmentList.iterator();
+     while (remaining.hasNext()) {
+         try {
+             add(document, (XWikiAttachment) remaining.next(), context);
+         } catch (Exception e) {
+             LOG.error("error retrieving attachment of document " + document.getFullName(), e);
+         }
+     }
+     return count;
+ }
+
+
+ /**
+ * Notification of changes in document content. Queues the new version
+ * of the document for indexing; any exception is logged and not
+ * propagated to the notifier.
+ *
+ * @see com.xpn.xwiki.notify.XWikiNotificationInterface#notify(com.xpn.xwiki.notify.XWikiNotificationRule,
+ *com.xpn.xwiki.doc.XWikiDocument,com.xpn.xwiki.doc.XWikiDocument,
+ *int,com.xpn.xwiki.XWikiContext)
+ */
+ public void notify(XWikiNotificationRule rule, XWikiDocument newDoc, XWikiDocument oldDoc, int event,
+ XWikiContext context) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("notify from XWikiDocChangeNotificationInterface, event=" + event + ", newDoc="
+ + newDoc + " oldDoc=" + oldDoc);
+ }
+ try {
+ add(newDoc, context);
+ } catch (Exception e) {
+ LOG.error("error in notify", e);
+ }
+ }
+
+ /**
+ * Notification of attachment uploads.
+ *
+ * @see com.xpn.xwiki.notify.XWikiActionNotificationInterface#notify(com.xpn.xwiki.notify.XWikiNotificationRule,
+ *com.xpn.xwiki.doc.XWikiDocument,java.lang.String,
+ *com.xpn.xwiki.XWikiContext)
+ */
+ public void notify(XWikiNotificationRule arg0, XWikiDocument doc, String action, XWikiContext context) {
+ if ("upload".equals(action)) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("upload action notification for doc " + doc.getName());
+ }
+ try {
+ List attachments = doc.getAttachmentList();
+ // find out the most recently changed attachment
+ XWikiAttachment newestAttachment = null;
+ for (Iterator iter = attachments.iterator(); iter.hasNext();) {
+ XWikiAttachment attachment = (XWikiAttachment) iter.next();
+ if (newestAttachment != null
+ && attachment.getDate().before(newestAttachment.getDate()))
+ newestAttachment = attachment;
+ else
+ newestAttachment = attachment;
+ }
+ add(doc, newestAttachment, context);
+ } catch (Exception e) {
+ LOG.error("error in notify", e);
+ }
+ }
+ }
+
+ /**
+ * @return the size reported by the queue of items waiting to be indexed
+ */
+ public long getQueueSize() {
+ return queue.getSize();
+ }
+}
Property changes on: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/IndexUpdater.java
___________________________________________________________________
Name: svn:eol-style
+ native
Added: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/LucenePlugin.java
===================================================================
--- xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/LucenePlugin.java 2006-12-06 21:50:31 UTC (rev 1689)
+++ xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/LucenePlugin.java 2006-12-06 22:29:02 UTC (rev 1690)
@@ -0,0 +1,313 @@
+/*
+ *
+ * ===================================================================
+ *
+ * Copyright (c) 2005 Jens Krämer, All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details, published at
+ * http://www.gnu.org/copyleft/gpl.html or in gpl.txt in the
+ * root folder of this distribution.
+ *
+ * Created on 21.01.2005
+ *
+ */
+package com.xpn.xwiki.plugin.lucene;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Properties;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.log4j.Logger;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.queryParser.ParseException;
+import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.queryParser.MultiFieldQueryParser;
+import org.apache.lucene.search.*;
+
+import com.xpn.xwiki.XWikiContext;
+import com.xpn.xwiki.doc.XWikiDocument;
+import com.xpn.xwiki.doc.XWikiAttachment;
+import com.xpn.xwiki.api.Api;
+import com.xpn.xwiki.api.XWiki;
+import com.xpn.xwiki.notify.DocChangeRule;
+import com.xpn.xwiki.notify.XWikiActionRule;
+import com.xpn.xwiki.plugin.XWikiDefaultPlugin;
+import com.xpn.xwiki.plugin.XWikiPluginInterface;
+
+public class LucenePlugin extends XWikiDefaultPlugin implements XWikiPluginInterface {
+ public static final String DOCTYPE_WIKIPAGE = "wikipage";
+ public static final String DOCTYPE_ATTACHMENT = "attachment";
+
+ private static final Logger LOG = Logger.getLogger(LucenePlugin.class);
+ private Analyzer analyzer;
+ private IndexUpdater indexUpdater;
+ private Thread indexUpdaterThread;
+ protected Properties config;
+
+ public static final String PROP_INDEX_DIR = "xwiki.plugins.lucene.indexdir";
+ public static final String PROP_ANALYZER = "xwiki.plugins.lucene.analyzer";
+ public static final String PROP_INDEXING_INTERVAL = "xwiki.plugins.lucene.indexinterval";
+
+ private static final String DEFAULT_ANALYZER = "org.apache.lucene.analysis.de.GermanAnalyzer";
+ private Searcher[] searchers;
+ private String indexDirs;
+ private IndexRebuilder indexRebuilder;
+
+ /**
+ * Creates the plugin and immediately runs {@link #init(XWikiContext)}.
+ */
+ public LucenePlugin(String name, String className, XWikiContext context) {
+ super(name, className, context);
+ init(context);
+ }
+
+ /**
+ * Asks the index updater, if there is one, to leave its loop before
+ * this plugin is finalized.
+ *
+ * @see java.lang.Object#finalize()
+ */
+ protected void finalize() throws Throwable {
+ if (indexUpdater != null) indexUpdater.doExit();
+ super.finalize();
+ }
+
+ /**
+ * Delegates to the index rebuilder.
+ *
+ * @return whatever the rebuilder's <code>rebuildIndex</code> returns
+ */
+ public synchronized int rebuildIndex(com.xpn.xwiki.api.XWiki wiki, XWikiContext context) {
+ return indexRebuilder.rebuildIndex(wiki, context);
+ }
+
+ /**
+  * Allows to search special named lucene indexes without having to configure
+  * them in xwiki.cfg. Slower than {@link #getSearchResults} since
+  * new index searcher instances are created for every query.
+  * <p>
+  * The searchers created for the call are closed again even when the
+  * search itself fails.
+  * </p>
+  *
+  * @param query query string
+  * @param myIndexDirs comma separated list of directories containing the lucene
+  *        indexes to search.
+  * @param languages comma separated list of language codes to search in, may be
+  *        null to search all languages
+  * @param wiki wiki API object handed to the search results
+  * @return the results of the search
+  * @throws Exception if creating the searchers, searching or closing fails
+  */
+ public SearchResults getSearchResultsFromIndexes(String query, String myIndexDirs, String languages,
+     XWiki wiki) throws Exception {
+     Searcher[] mySearchers = createSearchers(myIndexDirs);
+     try {
+         return search(query, null, languages, mySearchers, wiki);
+     } finally {
+         // previously skipped when search() threw, leaking the readers
+         closeSearchers(mySearchers);
+     }
+ }
+
+ /**
+ * Searches all Indexes configured in xwiki.cfg (property
+ * <code>xwiki.plugins.lucene.indexdir</code>)
+ *
+ * @param query query String entered into a search form
+ * @param wiki XWiki
+ * @param virtualWikiNames comma separated list of virtual wiki names to search in,
+ * global search when null
+ * @param languages comma separated list of language codes to search in, may be
+ * null to search all languages
+ * @return the {@link SearchResults} built from the hits
+ * @throws Exception in case of error(s)
+ */
+ public SearchResults getSearchResults(String query, String virtualWikiNames, String languages, XWiki wiki)
+ throws Exception {
+ return search(query, virtualWikiNames, languages, this.searchers, wiki);
+ }
+
+ /**
+  * Runs the query against the given indexes through one MultiSearcher.
+  *
+  * @param query query string entered by the user
+  * @param virtualWikiNames comma separated list of virtual wiki names to search in, may
+  *        be null to search all virtual wikis
+  * @param languages comma separated list of language codes to search in, may be
+  *        null to search all languages
+  * @param indexes searchers to query
+  * @param wiki wiki API object handed to the search results
+  * @return the hits wrapped in a {@link SearchResults}
+  * @throws IOException if searching the indexes fails
+  * @throws ParseException if the query string cannot be parsed
+  */
+ private SearchResults search(String query, String virtualWikiNames, String languages,
+     Searcher[] indexes, XWiki wiki) throws IOException, ParseException {
+     final MultiSearcher combined = new MultiSearcher(indexes);
+     final Query built = buildQuery(query, virtualWikiNames, languages);
+     final Hits found = combined.search(built);
+     if (LOG.isDebugEnabled()) {
+         LOG.debug("query " + built + " returned " + found.length() + " hits");
+     }
+     return new SearchResults(found, wiki);
+ }
+
+ /**
+  * Builds the Lucene query for a user query string.
+  * <p>
+  * A query containing <code>:</code> is treated as an object search: the
+  * part before the first colon is the field, the rest is parsed against
+  * it, and no wiki or language restriction is applied. Any other query
+  * is parsed against all known index fields and then restricted to the
+  * given wikis and languages.
+  * </p>
+  *
+  * @param query query string entered by the user
+  * @param virtualWikiNames comma separated list of virtual wiki names
+  * @param languages comma separated list of language codes to search in, may be
+  *        null to search all languages
+  * @return the combined query
+  * @throws ParseException if the query string cannot be parsed
+  */
+ private Query buildQuery(String query, String virtualWikiNames, String languages) throws ParseException {
+     // build a query like this: <user query string> AND <wikiNamesQuery> AND
+     // <languageQuery>
+     BooleanQuery bQuery = new BooleanQuery();
+
+     // for object search
+     int separator = query.indexOf(":");
+     if (separator >= 0) {
+         String property = query.substring(0, separator);
+         String text = query.substring(separator + 1);
+         QueryParser qp = new QueryParser(property, analyzer);
+         bQuery.add(qp.parse(text), BooleanClause.Occur.MUST);
+         return bQuery;
+     }
+
+     // for fulltext search
+     List fieldList = IndexUpdater.fields;
+     String[] fields = (String[]) fieldList.toArray(new String[fieldList.size()]);
+     BooleanClause.Occur[] flags = new BooleanClause.Occur[fields.length];
+     for (int i = 0; i < flags.length; i++) {
+         flags[i] = BooleanClause.Occur.SHOULD;
+     }
+     Query parsedQuery = MultiFieldQueryParser.parse(query, fields, flags, analyzer);
+     bQuery.add(parsedQuery, BooleanClause.Occur.MUST);
+
+     // The restrictions have to be MUST clauses: next to a MUST clause a
+     // SHOULD clause only influences scoring and filters nothing out.
+     if (virtualWikiNames != null && virtualWikiNames.length() > 0) {
+         bQuery.add(buildOredTermQuery(virtualWikiNames, IndexFields.DOCUMENT_WIKI), BooleanClause.Occur.MUST);
+     }
+     if (languages != null && languages.length() > 0) {
+         bQuery.add(buildOredTermQuery(languages, IndexFields.DOCUMENT_LANGUAGE), BooleanClause.Occur.MUST);
+     }
+     return bQuery;
+ }
+
+ /**
+  * @param values comma separated list of values to look for; surrounding
+  *        whitespace of each value is ignored
+  * @param fieldname name of the index field the values are matched against
+  * @return A query returning documents matching one of the given values in
+  *         the given field
+  */
+ private Query buildOredTermQuery(final String values, final String fieldname) {
+     String[] valueArray = values.split("\\,");
+     if (valueArray.length > 1) {
+         // build a query like this: <valueArray[0]> OR <valueArray[1]> OR ...
+         BooleanQuery orQuery = new BooleanQuery();
+         for (int i = 0; i < valueArray.length; i++) {
+             orQuery.add(new TermQuery(new Term(fieldname, valueArray[i].trim())), BooleanClause.Occur.SHOULD);
+         }
+         return orQuery;
+     }
+     // exactly one value, no OR'ed Terms necessary; trimmed like the
+     // values of a list so that " en" and "en" behave the same
+     return new TermQuery(new Term(fieldname, valueArray[0].trim()));
+ }
+
+ /**
+ * Initializes the plugin: instantiates the analyzer, opens the searchers
+ * on the configured index directories, starts the index updater thread
+ * and registers the updater for document change and action notifications.
+ *
+ * @param context context giving access to the wiki and its configuration
+ */
+ public synchronized void init(XWikiContext context) {
+ super.init(context);
+ if (LOG.isDebugEnabled()) LOG.debug("lucene plugin: in init");
+ config = context.getWiki().getConfig();
+ try {
+ // analyzer class name comes from the configuration, DEFAULT_ANALYZER if unset
+ analyzer = (Analyzer) Class.forName(config.getProperty(PROP_ANALYZER, DEFAULT_ANALYZER))
+ .newInstance();
+ } catch (Exception e) {
+ e.printStackTrace();
+ LOG.error("error instantiating analyzer : ", e);
+ LOG.warn("using default analyzer class: " + DEFAULT_ANALYZER);
+ try {
+ analyzer = (Analyzer) Class.forName(DEFAULT_ANALYZER).newInstance();
+ } catch (Exception e1) {
+ e1.printStackTrace();
+ // without any analyzer the plugin cannot work, so give up
+ throw new RuntimeException("instantiation of default analyzer " + DEFAULT_ANALYZER
+ + " failed", e1);
+ }
+ }
+ this.indexDirs = config.getProperty(PROP_INDEX_DIR);
+ openSearchers();
+ indexUpdater = new IndexUpdater();
+ indexUpdater.setAnalyzer(analyzer);
+ indexUpdater.init(config, this, context.getWiki());
+ // the updater polls its queue in a thread of its own
+ indexUpdaterThread = new Thread(indexUpdater);
+ indexUpdaterThread.start();
+ indexRebuilder = new IndexRebuilder();
+ indexRebuilder.setIndexUpdater(indexUpdater);
+ // the updater receives both document change and action notifications
+ context.getWiki().getNotificationManager().addGeneralRule(new DocChangeRule(indexUpdater));
+ context.getWiki().getNotificationManager().addGeneralRule(new XWikiActionRule(indexUpdater));
+ LOG.info("lucene plugin initialized.");
+ }
+
+ /**
+ * @return the fixed plugin name <code>lucene</code>
+ */
+ public String getName() {
+ return "lucene";
+ }
+
+ /**
+ * @param plugin plugin instance to wrap; cast to {@link LucenePlugin}
+ * @param context context handed to the API object
+ * @return a new {@link LucenePluginApi} for the given plugin
+ */
+ public Api getPluginApi(XWikiPluginInterface plugin, XWikiContext context) {
+ return new LucenePluginApi((LucenePlugin) plugin, context);
+ }
+
+ /**
+ * Creates an array of Searchers for a number of lucene indexes.
+ *
+ * @param indexDirs Comma separated list of Lucene index directories to create
+ * searchers for.
+ * @return Array of searchers
+ * @throws Exception
+ */
+ public static Searcher[] createSearchers(String indexDirs) throws Exception {
+ String[] dirs = StringUtils.split(indexDirs, " ,");
+ List searchersList = new ArrayList();
+ for (int i = 0; i < dirs.length; i++) {
+ try {
+ IndexReader reader = IndexReader.open(dirs[i]);
+ searchersList.add(new IndexSearcher(reader));
+ } catch (IOException e) {
+ LOG.error("cannot open index " + dirs[i], e);
+ e.printStackTrace();
+ }
+ }
+ return (Searcher[]) searchersList.toArray(new Searcher[searchersList.size()]);
+ }
+
+ /**
+ * Opens the searchers for the configured index Dirs after closing any
+ * already existing ones.
+ */
+ protected synchronized void openSearchers() {
+ try {
+ closeSearchers(this.searchers);
+ this.searchers = createSearchers(indexDirs);
+ } catch (Exception e1) {
+ LOG.error("error opening searchers for index dirs " + config.getProperty(PROP_INDEX_DIR), e1);
+ throw new RuntimeException("error opening searchers for index dirs "
+ + config.getProperty(PROP_INDEX_DIR), e1);
+ }
+ }
+
+ /**
+ * @throws IOException
+ */
+ protected static void closeSearchers(Searcher[] searchers) throws IOException {
+ if (searchers != null) {
+ for (int i = 0; i < searchers.length; i++) {
+ if (searchers[i] != null) searchers[i].close();
+ }
+ }
+ }
+
+ /**
+ * @return the queue size reported by the index updater
+ */
+ public long getQueueSize() {
+ return indexUpdater.getQueueSize();
+ }
+
+ /**
+ * Hands the document to the index updater for indexing.
+ */
+ public void queueDocument(XWikiDocument doc, XWikiContext context) {
+ indexUpdater.add(doc, context);
+ }
+
+ /**
+ * Hands a single attachment of the document to the index updater.
+ */
+ public void queueAttachment(XWikiDocument doc, XWikiAttachment attach, XWikiContext context) {
+ indexUpdater.add(doc, attach, context);
+ }
+
+ /**
+ * Hands all attachments of the document to the index updater.
+ */
+ public void queueAttachment(XWikiDocument doc, XWikiContext context) {
+ indexUpdater.addAttachmentsOfDocument(doc, context);
+ }
+
+}
\ No newline at end of file
Property changes on: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/LucenePlugin.java
___________________________________________________________________
Name: svn:eol-style
+ native
Added: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/LucenePluginApi.java
===================================================================
--- xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/LucenePluginApi.java 2006-12-06 21:50:31 UTC (rev 1689)
+++ xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/LucenePluginApi.java 2006-12-06 22:29:02 UTC (rev 1690)
@@ -0,0 +1,197 @@
+/*
+ *
+ * ===================================================================
+ *
+ * Copyright (c) 2005 Jens Krämer, All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details, published at
+ * http://www.gnu.org/copyleft/gpl.html or in gpl.txt in the
+ * root folder of this distribution.
+ *
+ * Created on 21.01.2005
+ *
+ */
+package com.xpn.xwiki.plugin.lucene;
+
+import org.apache.log4j.Logger;
+
+import com.xpn.xwiki.XWikiContext;
+import com.xpn.xwiki.api.Api;
+import com.xpn.xwiki.api.Context;
+
+/**
+ * This is the main interface for using the Plugin. It basically acts as a
+ * facade to the {@link LucenePlugin}class.
+ * <p>
+ * The methods intended for use in wiki pages are
+ * </p>
+ * <ul>
+ * <li>{@link #rebuildIndex(com.xpn.xwiki.api.XWiki, Context)}for rebuilding
+ * the whole index</li>
+ * <li>{@link #getSearchResults(String, String, com.xpn.xwiki.api.XWiki)}for
+ * searching the index</li>
+ * <li>
+ * {@link #getSearchResults(String, String, String, com.xpn.xwiki.api.XWiki)}
+ * for searching specific virtual wikis</li>
+ * <li>and
+ * {@link #getSearchResultsFromIndexes(String, String, String, com.xpn.xwiki.api.XWiki)}
+ * for searching other lucene indexes than those configured in
+ * <code>xwiki.cfg</code></li>
+ * </ul>
+ * @author <a href="mailto:jk at jkraemer.net">Jens Krämer </a>
+ */
+public class LucenePluginApi extends Api
+{
+ private LucenePlugin plugin;
+ private static final Logger LOG = Logger.getLogger (LucenePluginApi.class);
+
+ /**
+ * @param plugin plugin instance this API object delegates to
+ * @param context context passed to the {@link Api} base class
+ */
+ public LucenePluginApi (LucenePlugin plugin, XWikiContext context)
+ {
+ super (context);
+ setPlugin (plugin);
+ }
+
+ /**
+  * Starts a rebuild of the whole index. Only carried out when the given
+  * wiki object reports admin rights.
+  * @param wiki wiki API object, also used for the rights check
+  * @param context API context whose wrapped context is handed to the plugin
+  * @return the plugin's result for the rebuild; -1 if admin rights are
+  *         missing
+  */
+ public int rebuildIndex (com.xpn.xwiki.api.XWiki wiki, Context context)
+ {
+     if (!wiki.hasAdminRights ())
+     {
+         LOG.info ("access denied to rebuildIndex: insufficient rights");
+         return -1;
+     }
+     return getPlugin().rebuildIndex (wiki, context.getContext());
+ }
+
+ /**
+  * Searches the named indexes using the given query for documents in the
+  * given languages
+  * @param query
+  *            the query entered by the user
+  * @param indexDirs
+  *            comma separated list of lucene index directories to search in
+  * @param languages
+  *            comma separated list of language codes to search in, may be
+  *            null to search all languages
+  * @param wiki
+  *            reference to xwiki
+  * @return {@link SearchResults} instance containing the results, null if
+  *         the search failed.
+  */
+ public SearchResults getSearchResultsFromIndexes (String query, String indexDirs, String languages,
+                                                   com.xpn.xwiki.api.XWiki wiki)
+ {
+     try
+     {
+         // delegate to the plugin method that searches the named index
+         // directories; previously indexDirs was passed on as the list of
+         // virtual wiki names and the configured indexes were searched
+         return getPlugin ().getSearchResultsFromIndexes (query, indexDirs, languages, wiki);
+     } catch (Exception e)
+     {
+         LOG.error ("error searching indexes " + indexDirs, e);
+     } // end of try-catch
+     return null;
+ }
+
+ /**
+  * Searches the configured Indexes using the specified lucene query for
+  * documents in the given languages.
+  * <p>
+  * With virtual wikis enabled in your xwiki installation this will deliver
+  * results from all virtual wikis. For searching in a subset of your
+  * virtual wikis see
+  * {@link #getSearchResults(String, String, String, com.xpn.xwiki.api.XWiki)}
+  * </p>
+  * @param query
+  *            query entered by the user
+  * @param languages
+  *            comma separated list of language codes to search in, may be
+  *            null to search all languages. Language codes can be:
+  *            <ul>
+  *            <li><code>default</code> for content having no specific
+  *            language information</li>
+  *            <li>lower case 2-letter language codes like <code>en</code>,
+  *            <code>de</code> as used by xwiki</li>
+  *            </ul>
+  * @param wiki
+  *            reference to xwiki
+  * @return a {@link SearchResults} instance containing the results.
+  */
+ public SearchResults getSearchResults (String query, String languages, com.xpn.xwiki.api.XWiki wiki)
+ {
+     // null for the virtual wiki names means: search all virtual wikis
+     return getSearchResults (query, null, languages, wiki);
+ }
+
+ /**
+  * Searches the configured Indexes using the specified lucene query for
+  * documents in the given languages belonging to one of the given virtual
+  * wikis.
+  * <p>
+  * Using this method only makes sense with virtual wikis enabled. Otherwise
+  * use {@link #getSearchResults(String, String, com.xpn.xwiki.api.XWiki)}
+  * instead.
+  * </p>
+  * @param query
+  *            query entered by the user
+  * @param virtualWikiNames
+  *            Names of the virtual wikis to search in. May be null for
+  *            global search.
+  * @param languages
+  *            comma separated list of language codes to search in, may be
+  *            null to search all languages. Language codes can be:
+  *            <ul>
+  *            <li><code>default</code> for content having no specific
+  *            language information</li>
+  *            <li>lower case 2-letter language codes like <code>en</code>,
+  *            <code>de</code> as used by xwiki</li>
+  *            </ul>
+  * @param wiki
+  *            reference to xwiki
+  * @return a {@link SearchResults} instance containing the results, null
+  *         if the search failed.
+  */
+ public SearchResults getSearchResults (String query, String virtualWikiNames, String languages,
+                                        com.xpn.xwiki.api.XWiki wiki)
+ {
+     try
+     {
+         final SearchResults found = getPlugin ().getSearchResults (query, virtualWikiNames, languages, wiki);
+         if (LOG.isDebugEnabled ())
+         {
+             LOG.debug ("returning " + found.getHitcount () + " results");
+         }
+         return found;
+     } catch (Exception e)
+     {
+         e.printStackTrace ();
+         return null;
+     }
+ }
+
+ /**
+ * @return the number of documents in the queue, as reported by the plugin
+ */
+ public long getQueueSize() {
+ return plugin.getQueueSize();
+ }
+
+
+ /**
+ * @param plugin
+ * the plugin instance this object is the facade for.
+ */
+ public void setPlugin (LucenePlugin plugin)
+ {
+ this.plugin = plugin;
+ }
+
+ /**
+ * @return the plugin instance this object is the facade for.
+ */
+ public LucenePlugin getPlugin ()
+ {
+ return this.plugin;
+ }
+
+}
\ No newline at end of file
Property changes on: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/LucenePluginApi.java
___________________________________________________________________
Name: svn:eol-style
+ native
Added: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/ObjectData.java
===================================================================
--- xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/ObjectData.java 2006-12-06 21:50:31 UTC (rev 1689)
+++ xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/ObjectData.java 2006-12-06 22:29:02 UTC (rev 1690)
@@ -0,0 +1,209 @@
+/*
+ *
+ * ===================================================================
+ *
+ * Copyright (c) 2005 Jens Krämer, All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details, published at
+ * http://www.gnu.org/copyleft/gpl.html or in gpl.txt in the
+ * root folder of this distribution.
+ *
+ * Created on 25.01.2005
+ *
+ * @author Lokesh (N.Lokeswara Reddy) Congruent Solutions.Pvt.Ltd.
+ */
+package com.xpn.xwiki.plugin.lucene;
+
+import org.apache.log4j.Logger;
+import org.apache.lucene.document.Field;
+import com.xpn.xwiki.doc.XWikiDocument;
+import com.xpn.xwiki.XWikiContext;
+import com.xpn.xwiki.objects.BaseProperty;
+import com.xpn.xwiki.objects.BaseObject;
+import com.xpn.xwiki.objects.PropertyInterface;
+import com.xpn.xwiki.objects.classes.BaseClass;
+import com.xpn.xwiki.objects.classes.StaticListClass;
+import com.xpn.xwiki.objects.classes.ListClass;
+import com.xpn.xwiki.objects.classes.ListItem;
+
+import java.util.List;
+import java.util.Map;
+import java.util.Iterator;
+
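+/**
+ * Holds the property values of the XWiki objects attached to a document
+ * (objects of every class found on the document, not only XWiki.ArticleClass)
+ * so that they can be added to the Lucene index.
+ */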
+public class ObjectData extends IndexData {
+
+ private static final Logger LOG = Logger.getLogger(ObjectData.class);
+
+ public ObjectData(final XWikiDocument doc, final XWikiContext context) {
+ super(doc, context);
+ setAuthor(doc.getAuthor());
+ setCreator(doc.getCreator());
+ setModificationDate(doc.getDate());
+ setCreationDate(doc.getCreationDate());
+ }
+
+
+ /**
+ * Object data is indexed under the wiki page document type.
+ * @return {@link LucenePlugin#DOCTYPE_WIKIPAGE}
+ * @see com.xpn.xwiki.plugin.lucene.IndexData#getType()
+ */
+ public String getType() {
+ return LucenePlugin.DOCTYPE_WIKIPAGE;
+ }
+
+    /**
+     * Builds the full text of the document: the text produced by
+     * {@link IndexData#getFullText(XWikiDocument,XWikiContext)}, a blank, and
+     * the property values of the objects attached to the document, as far as
+     * they could be extracted.
+     *
+     * @param doc document being indexed
+     * @param context current XWiki context
+     * @return the combined full text
+     */
+    public String getFullText(XWikiDocument doc, XWikiContext context) {
+        final StringBuffer fullText = new StringBuffer(super.getFullText(doc, context));
+        final String objectText = getContentAsText(doc, context);
+        if (objectText == null) {
+            return fullText.toString();
+        }
+        fullText.append(" ");
+        fullText.append(objectText);
+        return fullText.toString();
+    }
+
+    /**
+     * Concatenates the property values of every object attached to the
+     * document, whatever the class of the object.
+     *
+     * @param doc document whose objects are read
+     * @param context current XWiki context, handed on to the per-object extraction
+     * @return the property values separated by blanks; empty or partial when
+     *         reading the objects failed, never null
+     */
+    private String getContentAsText(XWikiDocument doc, XWikiContext context) {
+        StringBuffer contentText = new StringBuffer();
+        try {
+            if (LOG.isDebugEnabled()) {
+                LOG.debug("extracting object content of " + doc.getFullName());
+            }
+            Map objects = doc.getxWikiObjects();
+            Iterator itKey = objects.keySet().iterator();
+            while (itKey.hasNext()) {
+                String className = (String) itKey.next();
+                Iterator itObj = doc.getObjects(className).iterator();
+                while (itObj.hasNext()) {
+                    extractContent(contentText, (BaseObject) itObj.next(), context);
+                }
+            }
+        } catch (Exception e) {
+            // The logger records the stack trace; no second dump to stderr.
+            LOG.error("error getting content from XWiki Objects ", e);
+        }
+        return contentText.toString();
+    }
+
+    /**
+     * Appends the values of all properties of one object to the buffer. A
+     * blank is appended after every property, whether or not it has a value.
+     *
+     * @param contentText buffer receiving the text
+     * @param baseObject object to read; ignored when null
+     * @param context current XWiki context (not used by the extraction itself)
+     */
+    private void extractContent(StringBuffer contentText, BaseObject baseObject, XWikiContext context) {
+        if (baseObject == null) {
+            return;
+        }
+        try {
+            Object[] propertyNames = baseObject.getPropertyNames();
+            for (int i = 0; i < propertyNames.length; i++) {
+                BaseProperty baseProperty = (BaseProperty) baseObject.getField((String) propertyNames[i]);
+                // read the value once instead of once for the check and once for the append
+                Object value = (baseProperty != null) ? baseProperty.getValue() : null;
+                if (value != null) {
+                    contentText.append(value.toString());
+                }
+                contentText.append(" ");
+            }
+        } catch (Exception e) {
+            // The logger records the stack trace; no second dump to stderr.
+            LOG.error("error getting content from XWiki Object ", e);
+        }
+    }
+
+    /**
+     * Adds the fields of the parent class to the Lucene document and then
+     * indexes every property of every object attached to the wiki document.
+     * A failure on one property is logged and does not stop the others.
+     *
+     * @param luceneDoc Lucene document receiving the fields
+     * @param doc wiki document whose objects are indexed
+     * @param context current XWiki context
+     */
+    public void addDataToLuceneDocument(org.apache.lucene.document.Document luceneDoc, XWikiDocument doc,
+                                        XWikiContext context) {
+
+        super.addDataToLuceneDocument(luceneDoc, doc, context);
+        Map objects = doc.getxWikiObjects();
+        for (Iterator itr = objects.keySet().iterator(); itr.hasNext();) {
+            String className = (String) itr.next();
+            for (Iterator itObj = doc.getObjects(className).iterator(); itObj.hasNext();) {
+                BaseObject baseObject = (BaseObject) itObj.next();
+                if (baseObject == null) {
+                    // extractContent guards against null objects as well; without this
+                    // check a null entry would abort indexing of the whole document.
+                    continue;
+                }
+                Object[] propertyNames = baseObject.getPropertyNames();
+                for (int i = 0; i < propertyNames.length; i++) {
+                    try {
+                        indexProperty(luceneDoc, baseObject, (String) propertyNames[i], context);
+                    } catch (Exception e) {
+                        LOG.error("error extracting fulltext for document " + this, e);
+                    }
+                }
+            }
+        }
+    }
+
+    /**
+     * Indexes one property of an object. Multi-select static lists are
+     * delegated to indexStaticList; any other property is stored as a single
+     * tokenized field named <code>ClassName.propertyName</code>.
+     */
+    private void indexProperty(org.apache.lucene.document.Document luceneDoc, BaseObject baseObject, String propertyName, XWikiContext context) {
+        final String luceneFieldName = baseObject.getClassName() + "." + propertyName;
+        final PropertyInterface definition = baseObject.getxWikiClass(context).getField(propertyName);
+
+        final boolean multiSelectList =
+            definition instanceof StaticListClass && ((StaticListClass) definition).isMultiSelect();
+        if (multiSelectList) {
+            indexStaticList(luceneDoc, baseObject, (StaticListClass) definition, propertyName, context);
+            return;
+        }
+
+        final String text = getContentAsText(baseObject, propertyName);
+        if (text == null) {
+            return;
+        }
+        luceneDoc.add(new Field(luceneFieldName, text, Field.Store.YES, Field.Index.TOKENIZED));
+    }
+
+ /**
+ * Indexes the selected entries of a static list property. For every
+ * selected entry that is found in the list definition, the entry id goes
+ * into <code>ClassName.property.key</code> and the entry value into
+ * <code>ClassName.property.value</code>. The stored entry itself is always
+ * added to <code>ClassName.property</code>.
+ */
+ private void indexStaticList(org.apache.lucene.document.Document luceneDoc, BaseObject baseObject, StaticListClass prop, String propertyName, XWikiContext context) {
+ Map possibleValues = prop.getMap(context);
+ List keys = baseObject.getListValue(propertyName);
+ String fieldFullName = baseObject.getClassName() + "." + propertyName;
+ Iterator it = keys.iterator();
+ while (it.hasNext()) {
+ String value = (String) it.next();
+ ListItem item = (ListItem) possibleValues.get(value);
+ if (item != null) {
+ // index the id (key) of the list item
+ String fieldName = fieldFullName + ".key";
+ luceneDoc.add(new Field(fieldName, item.getId(), Field.Store.YES, Field.Index.TOKENIZED));
+ // index the value of the list item
+ fieldName = fieldFullName + ".value";
+ luceneDoc.add(new Field(fieldName, item.getValue(), Field.Store.YES, Field.Index.TOKENIZED));
+ // when the value differs from the id, also index the value under the plain property field
+ if (!item.getId().equals(item.getValue())) {
+ luceneDoc.add(new Field(fieldFullName, item.getValue(), Field.Store.YES, Field.Index.TOKENIZED));
+ }
+ }
+ // the entry as stored in the object is always indexed under the plain property field,
+ // even when it is not part of the list definition
+ luceneDoc.add(new Field(fieldFullName, value, Field.Store.YES, Field.Index.TOKENIZED));
+ }
+ }
+
+ /**
+ * Returns the text of a single property of an object.
+ * @param doc not used by this method
+ * @param baseObject object holding the property
+ * @param property name of the property to read
+ * @param context not used by this method
+ * @return the text returned by getContentAsText(baseObject, property)
+ */
+ public String getFullText(XWikiDocument doc, BaseObject baseObject, String property, XWikiContext context) {
+ return getContentAsText(baseObject, property);
+ }
+
+    /**
+     * Returns the value of one property of an object as text.
+     *
+     * @param baseObject object holding the property
+     * @param property name of the property to read
+     * @return the string form of the property value; an empty string when the
+     *         property is not set, has no value or could not be read. Never null.
+     */
+    private String getContentAsText(BaseObject baseObject, String property) {
+
+        StringBuffer contentText = new StringBuffer();
+        try {
+            BaseProperty baseProperty = (BaseProperty) baseObject.getField(property);
+            // A property that is not set on the object is a normal case, not an
+            // error: skip it instead of running into a NullPointerException
+            // (extractContent applies the same guard).
+            if ((baseProperty != null) && (baseProperty.getValue() != null)) {
+                contentText.append(baseProperty.getValue().toString());
+            }
+        } catch (Exception e) {
+            // The logger records the stack trace; no second dump to stderr.
+            LOG.error("error getting content from XWiki Objects ", e);
+        }
+        return contentText.toString();
+    }
+
+
+}
Property changes on: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/ObjectData.java
___________________________________________________________________
Name: svn:eol-style
+ native
Added: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/SearchResult.java
===================================================================
--- xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/SearchResult.java 2006-12-06 21:50:31 UTC (rev 1689)
+++ xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/SearchResult.java 2006-12-06 22:29:02 UTC (rev 1690)
@@ -0,0 +1,207 @@
+/*
+ *
+ * ===================================================================
+ *
+ * Copyright (c) 2005 Jens Krämer, All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details, published at
+ * http://www.gnu.org/copyleft/gpl.html or in gpl.txt in the
+ * root folder of this distribution.
+ *
+ * Created on 25.01.2005
+ *
+ */
+
+package com.xpn.xwiki.plugin.lucene;
+import java.util.Date;
+
+import org.apache.log4j.Logger;
+
+import com.xpn.xwiki.XWikiContext;
+import com.xpn.xwiki.XWikiException;
+import com.xpn.xwiki.api.Context;
+import com.xpn.xwiki.api.Document;
+import com.xpn.xwiki.web.XWikiURLFactory;
+
+/**
+ * Result of a search. The Plugin will return a collection of these for display
+ * on the search page.
+ * @author <a href="mailto:jk at jkraemer.net">Jens Krämer </a>
+ */
+public class SearchResult
+{
+ private float score;
+ private String name;
+ private String wiki;
+ private String web;
+ private String url;
+ private String filename;
+ private String type;
+ private String author;
+ private String language;
+ private Date date;
+ private Date creationDate;
+ private String creator;
+ private static final Logger LOG = Logger.getLogger (SearchResult.class);
+
+    /**
+     * Builds a search result from a document of the Lucene index. For
+     * attachments the download url is resolved through the given xwiki api
+     * object; when that fails the url stays null.
+     *
+     * @todo add fallback for unknown index field names (read values into a map
+     *       accessible from search results page) This would be useful for
+     *       integration of external indexes where the field names dont match
+     *       ours.
+     * @param doc Lucene document the field values are read from
+     * @param score score of this hit
+     * @param xwiki xwiki api object used to look up the document of an attachment
+     * @todo TODO: to be more flexible make a factory to construct different
+     *       kinds of searchresults, esp. for external indexes and custom
+     *       implementations of searchresults
+     */
+    public SearchResult (org.apache.lucene.document.Document doc, float score, com.xpn.xwiki.api.XWiki xwiki)
+    {
+        this.score = score;
+        name = doc.get (IndexFields.DOCUMENT_NAME);
+        web = doc.get (IndexFields.DOCUMENT_WEB);
+        wiki = doc.get (IndexFields.DOCUMENT_WIKI);
+        type = doc.get (IndexFields.DOCUMENT_TYPE);
+        author = doc.get (IndexFields.DOCUMENT_AUTHOR);
+        creator = doc.get (IndexFields.DOCUMENT_CREATOR);
+        language = doc.get (IndexFields.DOCUMENT_LANGUAGE);
+        date = IndexFields.stringToDate (doc.get (IndexFields.DOCUMENT_DATE));
+        creationDate = IndexFields.stringToDate (doc.get (IndexFields.DOCUMENT_CREATIONDATE));
+        if (LucenePlugin.DOCTYPE_ATTACHMENT.equals (type))
+        {
+            filename = doc.get (IndexFields.FILENAME);
+            final String fullDocName = new StringBuffer (wiki).append (":").append (web).append (".")
+                .append (name).toString ();
+            try
+            {
+                final Document document = xwiki.getDocument (fullDocName);
+                // NOTE(review): assumes getDocument may hand back null (for
+                // instance for a document the user cannot access) - confirm.
+                if (document != null)
+                {
+                    url = document.getAttachmentURL (filename, "download");
+                }
+            } catch (XWikiException e)
+            {
+                // pass the exception to the logger so the cause is kept in the log
+                LOG.error ("error retrieving url for attachment " + filename + " of document " + fullDocName, e);
+            }
+        }
+    }
+
+ /**
+ * @return Returns the name of the user who last modified the document.
+ */
+ public String getAuthor ()
+ {
+ return author;
+ }
+
+ /**
+ * @return Returns the date of last modification.
+ */
+ public Date getDate ()
+ {
+ return date;
+ }
+
+ /**
+ * @return Returns the filename, only used for Attachments (see
+ * {@link #getType()})
+ */
+ public String getFilename ()
+ {
+ return filename;
+ }
+
+ /**
+ * @return Returns the name of the document.
+ */
+ public String getName ()
+ {
+ return name;
+ }
+
+ /**
+ * @return Returns the score of this search result as computed by lucene. Is
+ * a float between zero and 1.
+ */
+ public float getScore ()
+ {
+ return score;
+ }
+
+ /**
+ * @return Returns the type of the document, atm this can be either
+ * <code>wikipage</code> or <code>attachment</code>.
+ */
+ public String getType ()
+ {
+ return type;
+ }
+
+ /**
+ * @return Returns the url to access the document.
+ */
+ public String getUrl ()
+ {
+ return url;
+ }
+
+ /**
+ * @return Returns the web the document belongs to.
+ */
+ public String getWeb ()
+ {
+ return web;
+ }
+
+ /**
+ * @return the language of the Document, i.e. <code>de</code> or
+ * <code>en</code>,<code>default</code> if no language was set
+ * at indexing time.
+ */
+ public String getLanguage ()
+ {
+ return language;
+ }
+
+ /**
+ * @return creationDate of this document
+ */
+ public Date getCreationDate ()
+ {
+ return creationDate;
+ }
+
+ /**
+ * @return Username of the creator of the document
+ */
+ public String getCreator ()
+ {
+ return creator;
+ }
+
+ public void setUrl (String url)
+ {
+ this.url = url;
+ }
+
+ public String getWiki ()
+ {
+ return wiki;
+ }
+
+ /**
+ * @return true when this result points to wiki content (attachment or a
+ * wiki page)
+ */
+ public boolean isWikiContent ()
+ {
+ return (LucenePlugin.DOCTYPE_WIKIPAGE.equals (type) || LucenePlugin.DOCTYPE_ATTACHMENT.equals (type));
+ }
+}
Property changes on: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/SearchResult.java
___________________________________________________________________
Name: svn:eol-style
+ native
Added: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/SearchResults.java
===================================================================
--- xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/SearchResults.java 2006-12-06 21:50:31 UTC (rev 1689)
+++ xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/SearchResults.java 2006-12-06 22:29:02 UTC (rev 1690)
@@ -0,0 +1,205 @@
+/*
+ *
+ * ===================================================================
+ *
+ * Copyright (c) 2005 Jens Krämer, All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details, published at
+ * http://www.gnu.org/copyleft/gpl.html or in gpl.txt in the
+ * root folder of this distribution.
+ *
+ * Created on 01.02.2005
+ *
+ */
+package com.xpn.xwiki.plugin.lucene;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.log4j.Logger;
+import org.apache.lucene.search.Hits;
+
+import com.xpn.xwiki.api.XWiki;
+
+/**
+ * Container for the results of a search.
+ * <p>
+ * This class handles paging through search results and enforces the xwiki
+ * rights management by only returning search results the user executing the
+ * search is allowed to view.
+ * </p>
+ * @author <a href="mailto:jk at jkraemer.net">Jens Krämer </a>
+ */
+public class SearchResults
+{
+ private final XWiki xwiki;
+ private final Hits hits;
+ private static final Logger LOG = Logger.getLogger (SearchResults.class);
+
+ private List relevantResults;
+
+ /**
+ * Creates a container for the given hits. The constructor only stores its
+ * arguments.
+ * @param hits
+ * Lucene search results
+ * @param xwiki
+ * xwiki instance for access rights checking
+ */
+ public SearchResults (Hits hits, XWiki xwiki)
+ {
+ this.hits = hits;
+ this.xwiki = xwiki;
+ }
+
+    /**
+     * Lazily converts the Lucene hits into {@link SearchResult}s, keeping only
+     * wiki content that the xwiki api reports as viewable and existing. The
+     * list is built on the first call and reused afterwards. A hit that fails
+     * to convert is logged and left out.
+     */
+    private List getRelevantResults ()
+    {
+        if (relevantResults != null)
+        {
+            return relevantResults;
+        }
+        relevantResults = new ArrayList ();
+        final int total = hits.length ();
+        for (int index = 0; index < total; index++)
+        {
+            try
+            {
+                final SearchResult candidate = new SearchResult (hits.doc (index), hits.score (index), xwiki);
+                if (!candidate.isWikiContent ())
+                {
+                    continue;
+                }
+                final String pageName = candidate.getWeb () + "." + candidate.getName ();
+                if (xwiki.checkAccess (pageName, "view") && xwiki.exists (pageName))
+                {
+                    relevantResults.add (candidate);
+                }
+            } catch (Exception e)
+            {
+                LOG.error ("error getting search result", e);
+                e.printStackTrace ();
+            }
+        }
+        return relevantResults;
+    }
+
+    /**
+     * @param beginIndex
+     *            index of the first result of the current page, as a string
+     * @param items
+     *            number of results per page, as a string
+     * @return true when there are more results than currently displayed.
+     */
+    public boolean hasNext (String beginIndex, String items)
+    {
+        final int pageSize = Integer.parseInt (items);
+        final int first = Integer.parseInt (beginIndex);
+        final int lastShown = first + pageSize - 1;
+        return getRelevantResults ().size () > lastShown;
+    }
+
+ /**
+ * @param beginIndex
+ * @return true when there is a page before the one currently displayed,
+ * that is, when <code>beginIndex > 1</code>
+ */
+ public boolean hasPrevious (String beginIndex)
+ {
+ return Integer.parseInt (beginIndex) > 1;
+ }
+
+    /**
+     * @param beginIndex
+     *            index of the first result of the current page, as a string
+     * @param items
+     *            number of results per page, as a string
+     * @return the value to be used for the firstIndex URL parameter to build a
+     *         link pointing to the next page of results. When the next page
+     *         would start behind the last result, the start of the last full
+     *         page is returned instead; the value is never smaller than 1.
+     */
+    public int getNextIndex (String beginIndex, String items)
+    {
+        final int itemCount = Integer.parseInt (items);
+        final int resultcount = getRelevantResults ().size ();
+        final int retval = Integer.parseInt (beginIndex) + itemCount;
+        if (retval > resultcount)
+        {
+            // With fewer results than one page holds, resultcount - itemCount + 1
+            // drops to zero or below; clamp to 1 like getPreviousIndex does.
+            return Math.max (1, resultcount - itemCount + 1);
+        }
+        return retval;
+    }
+
+ /**
+ * @param beginIndex
+ * @param items
+ * @return the value to be used for the firstIndex URL parameter to build a
+ * link pointing to the previous page of results
+ */
+ public int getPreviousIndex (String beginIndex, String items)
+ {
+ int retval = Integer.parseInt (beginIndex) - Integer.parseInt (items);
+ return 0 < retval ? retval : 1;
+ }
+
+    /**
+     * @param beginIndex
+     *            index of the first result of the current page, as a string
+     * @param items
+     *            number of results per page, as a string
+     * @return the index of the last displayed search result, capped at the
+     *         number of available results
+     */
+    public int getEndIndex (String beginIndex, String items)
+    {
+        final int requestedEnd = Integer.parseInt (beginIndex) + Integer.parseInt (items) - 1;
+        return Math.min (requestedEnd, getRelevantResults ().size ());
+    }
+
+ /**
+ * Helper method for use in velocity templates, takes string values instead
+ * of ints. See {@link #getResults(int, int)}.
+ * @param beginIndex
+ * index of the first result to return, parsed with Integer.parseInt
+ * @param items
+ * number of items to return, parsed with Integer.parseInt
+ * @return the list returned by {@link #getResults(int, int)} for the
+ * parsed values
+ */
+ public List getResults (String beginIndex, String items)
+ {
+ return getResults (Integer.parseInt (beginIndex), Integer.parseInt (items));
+ }
+
+    /**
+     * Returns a list of search results. According to beginIndex and items,
+     * only a subset of the results is returned. To get the first ten results,
+     * one would use beginIndex=1 and items=10.
+     * @param beginIndex
+     *            1-based index of first result to return; values below 1 are
+     *            treated as 1.
+     * @param items
+     *            number of items to return
+     * @return List of SearchResult instances starting at
+     *         <code>beginIndex</code> and containing up to
+     *         <code>items</code> elements. An empty list when
+     *         <code>beginIndex</code> lies behind the last result or
+     *         <code>items</code> is not positive.
+     */
+    public List getResults (int beginIndex, int items)
+    {
+        final List results = getRelevantResults ();
+        final int resultcount = results.size ();
+        // clamp the start so an index below 1 cannot make subList throw
+        final int listStartIndex = Math.max (beginIndex - 1, 0);
+        if (items <= 0 || listStartIndex >= resultcount)
+        {
+            return new ArrayList ();
+        }
+        // compare against the remaining count instead of adding first, so a
+        // very large item count cannot overflow
+        final int remaining = resultcount - listStartIndex;
+        final int listEndIndex = items < remaining ? listStartIndex + items : resultcount;
+        return results.subList (listStartIndex, listEndIndex);
+    }
+
+ /**
+ * @return all search results in one list.
+ */
+ public List getResults ()
+ {
+ return getRelevantResults ();
+ }
+
+ /**
+ * @return total number of searchresults the user is allowed to view
+ */
+ public int getHitcount ()
+ {
+ return getRelevantResults ().size ();
+ }
+}
Property changes on: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/SearchResults.java
___________________________________________________________________
Name: svn:eol-style
+ native
Added: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/TextExtractor.java
===================================================================
--- xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/TextExtractor.java 2006-12-06 21:50:31 UTC (rev 1689)
+++ xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/TextExtractor.java 2006-12-06 22:29:02 UTC (rev 1690)
@@ -0,0 +1,81 @@
+/*
+ *
+ * ===================================================================
+ *
+ * Copyright (c) 2005 Jens Krämer, All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details, published at
+ * http://www.gnu.org/copyleft/gpl.html or in gpl.txt in the
+ * root folder of this distribution.
+ *
+ * Created on 25.01.2005
+ *
+ */
+
+package com.xpn.xwiki.plugin.lucene;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.log4j.Logger;
+import com.xpn.xwiki.plugin.lucene.textextraction.*;
+
+/**
+ * Extraction of text from various binary formats. Extraction itself is done by
+ * the {@link MimetypeTextExtractor} implementations in the package
+ * <code>com.xpn.xwiki.plugin.lucene.textextraction</code> (originally the
+ * textExtractor classes in packages below <code>org.outerj.daisy</code>
+ * taken from the <a href="http://new.cocoondev.org/daisy">Daisy project </a>).
+ * @author <a href="mailto:jk at jkraemer.net">Jens Krämer </a>
+ */
+public class TextExtractor
+{
+    private static final Logger LOG = Logger.getLogger (TextExtractor.class);
+
+    /** maps a mimetype to the extractor instance handling it */
+    static final Map textExtractors = new HashMap ();
+    static
+    {
+        // TODO: make text extractors more pluggable by moving this into a config file.
+        final XmlTextExtractor xmlTextExtractor = new XmlTextExtractor ();
+        textExtractors.put ("application/xhtml+xml", xmlTextExtractor);
+        textExtractors.put ("text/xml", xmlTextExtractor);
+        textExtractors.put ("text/plain", new PlainTextExtractor());
+        textExtractors.put ("application/pdf", new PDFTextExtractor());
+//        textExtractors.put ("application/vnd.sun.xml.writer", new OpenOfficeTextExtractor ());
+        textExtractors.put ("application/msword", new MSWordTextExtractor ());
+        // PowerPoint and Excel are registered under the names used so far and
+        // under the registered vnd.* media types, so that attachments stored
+        // with either name are indexed.
+        final MSPowerPointTextExtractor powerPointTextExtractor = new MSPowerPointTextExtractor ();
+        textExtractors.put ("application/ms-powerpoint", powerPointTextExtractor);
+        textExtractors.put ("application/vnd.ms-powerpoint", powerPointTextExtractor);
+        final MSExcelTextExtractor excelTextExtractor = new MSExcelTextExtractor ();
+        textExtractors.put ("application/ms-excel", excelTextExtractor);
+        textExtractors.put ("application/vnd.ms-excel", excelTextExtractor);
+    }
+
+    /**
+     * Extracts the text of the given content with the extractor registered
+     * for the mimetype.
+     * @param content
+     *            raw bytes of the document
+     * @param mimetype
+     *            mimetype used to look up the extractor
+     * @return the extracted text, or null when no extractor is registered for
+     *         the mimetype or the extraction failed
+     */
+    public static String getText (byte[] content, String mimetype)
+    {
+        final MimetypeTextExtractor extractor = (MimetypeTextExtractor) textExtractors.get (mimetype);
+        if (extractor == null)
+        {
+            LOG.info ("no text extractor for mimetype " + mimetype);
+            return null;
+        }
+        try
+        {
+            return extractor.getText (content);
+        } catch (Exception e)
+        {
+            // The logger records the stack trace; no second dump to stderr.
+            LOG.error ("error getting text for mimetype " + mimetype, e);
+        }
+        return null;
+    }
+
+}
Property changes on: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/TextExtractor.java
___________________________________________________________________
Name: svn:eol-style
+ native
Added: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/XWikiDocumentQueue.java
===================================================================
--- xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/XWikiDocumentQueue.java 2006-12-06 21:50:31 UTC (rev 1689)
+++ xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/XWikiDocumentQueue.java 2006-12-06 22:29:02 UTC (rev 1690)
@@ -0,0 +1,68 @@
+/*
+ *
+ * ===================================================================
+ *
+ * Copyright (c) 2005 Jens Krämer, All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details, published at
+ * http://www.gnu.org/copyleft/gpl.html or in gpl.txt in the
+ * root folder of this distribution.
+ *
+ * Created on 24.01.2005
+ *
+ */
+
+package com.xpn.xwiki.plugin.lucene;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.collections.Buffer;
+import org.apache.commons.collections.buffer.UnboundedFifoBuffer;
+
+import com.xpn.xwiki.doc.XWikiDocument;
+
+/**
+ * Queue of {@link IndexData} items keyed by their <code>toString()</code>
+ * value. Items leave the queue in the order their keys were first added;
+ * adding an item whose key is already queued replaces the queued item
+ * without changing its position. All methods synchronize on the queue
+ * instance.
+ * @author <a href="mailto:jk at jkraemer.net">Jens Krämer </a>
+ */
+public class XWikiDocumentQueue
+{
+    /** maps names of documents to the document instances itself */
+    private final Map documentsByName = new HashMap ();
+    /** maintains fifo order */
+    private final Buffer namesQueue = new UnboundedFifoBuffer ();
+
+    /**
+     * Removes the oldest key from the queue and returns the item stored for it.
+     * @return the item that was queued first
+     */
+    public synchronized IndexData remove ()
+    {
+        return (IndexData) documentsByName.remove (namesQueue.remove ());
+    }
+
+    /**
+     * Queues the item, or replaces the item already queued under the same key.
+     * @param data item to queue; its <code>toString()</code> value is the key
+     */
+    public synchronized void add (IndexData data)
+    {
+        final String key = data.toString ();
+        if (!documentsByName.containsKey (key))
+        {
+            // document with this name not yet in Queue, so add it
+            namesQueue.add (key);
+        }
+        // in any case put new version of this document in the map, overwriting
+        // possibly existing older version
+        documentsByName.put (key, data);
+    }
+
+    /**
+     * @return true when no item is queued
+     */
+    public synchronized boolean isEmpty ()
+    {
+        return namesQueue.isEmpty ();
+    }
+
+    /**
+     * Synchronized like the other accessors, so the size is not read while
+     * another thread is modifying the queue.
+     * @return the number of queued items
+     */
+    public synchronized long getSize() {
+        return namesQueue.size();
+    }
+}
Property changes on: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/XWikiDocumentQueue.java
___________________________________________________________________
Name: svn:eol-style
+ native
Added: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/MSExcelTextExtractor.java
===================================================================
--- xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/MSExcelTextExtractor.java 2006-12-06 21:50:31 UTC (rev 1689)
+++ xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/MSExcelTextExtractor.java 2006-12-06 22:29:02 UTC (rev 1690)
@@ -0,0 +1,213 @@
+package com.xpn.xwiki.plugin.lucene.textextraction;
+import org.apache.poi.hssf.usermodel.*;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.log4j.Logger;
+
+import java.io.ByteArrayInputStream;
+import java.text.SimpleDateFormat;
+import java.text.DecimalFormat;
+import java.util.Date;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: lokesh
+ * Date: Oct 17, 2006
+ * Time: 4:49:10 PM
+ * To change this template use File | Settings | File Templates.
+ */
+public class MSExcelTextExtractor implements MimetypeTextExtractor {
+
+ private static final Logger LOG = Logger.getLogger(MSExcelTextExtractor.class);
+ /**
+ * The currently preparing Excel workbook.
+ */
+ private HSSFWorkbook mWorkbook;
+
+ /**
+ * Contains all data formats used in the currently preparing Excel workbook.
+ */
+ private HSSFDataFormat mDataFormat;
+
+ public static final int DEFAULT_BUFFER_SIZE = 16384; // 16 k
+
+ /**
+ * Extracts all text from an Excel document by parsing every sheet of the
+ * workbook. The workbook and its data formats are kept in the instance
+ * fields mWorkbook and mDataFormat while parsing.
+ * NOTE(review): because of these fields, concurrent calls on one instance
+ * would overwrite each other's state - confirm the extractor is only
+ * used from one thread at a time.
+ * @param data raw bytes of the Excel file
+ * @return the cell texts of all sheets, each followed by a blank
+ * @throws Exception when the data cannot be read as a workbook
+ */
+ public String getText(byte[] data) throws Exception {
+
+ POIFSFileSystem poiFs = new POIFSFileSystem(new ByteArrayInputStream(data));
+ mWorkbook = new HSSFWorkbook(poiFs);
+ mDataFormat = mWorkbook.createDataFormat();
+
+ StringBuffer cleanBuffer = new StringBuffer(DEFAULT_BUFFER_SIZE);
+ for (int sheetIdx = 0; sheetIdx < mWorkbook.getNumberOfSheets(); sheetIdx++) {
+ HSSFSheet sheet = mWorkbook.getSheetAt(sheetIdx);
+
+ if (sheet != null) {
+ parseSheet(sheet, cleanBuffer);
+ }
+ }
+ return cleanBuffer.toString();
+ }
+
+ /**
+ * It will parse the sheet with row wise and get the text from the sheet.
+ * @param sheet
+ * @param cleanBuffer
+ */
+
+ private void parseSheet(HSSFSheet sheet, StringBuffer cleanBuffer) {
+ int firstRow = sheet.getFirstRowNum();
+ int lastRow = sheet.getLastRowNum();
+ for (int rowIdx = firstRow; rowIdx <= lastRow; rowIdx++) {
+ HSSFRow row = sheet.getRow(rowIdx);
+
+ if (row != null) {
+ parseRow(row, cleanBuffer);
+ }
+ }
+ }
+
+ /**
+ * It will parse row and return the text
+ * @param row
+ * @param cleanBuffer
+ */
+ private void parseRow(HSSFRow row, StringBuffer cleanBuffer) {
+ short firstCell = row.getFirstCellNum();
+ short lastCell = row.getLastCellNum();
+ for (short cellIdx = firstCell; cellIdx <= lastCell; cellIdx++) {
+ HSSFCell cell = row.getCell(cellIdx);
+
+ if (cell != null) {
+ parseCell(cell, cleanBuffer);
+ }
+ }
+ }
+
+ /**
+ * Extracts all text from each cell of the sheet
+ * @param cell
+ * @param cleanBuffer
+ */
+ private void parseCell(HSSFCell cell, StringBuffer cleanBuffer) {
+ String cellValue = null;
+
+ if (cell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
+ cellValue = cell.getStringCellValue();
+ } else if (cell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
+ HSSFCellStyle style = cell.getCellStyle();
+ short formatId = style.getDataFormat();
+ String formatPattern = mDataFormat.getFormat(formatId);
+ formatPattern = replace(formatPattern, "\\ ", " ");
+
+ if (isCellDateFormatted(cell)) {
+ // This is a date
+ formatPattern = replace(formatPattern, "mmmm", "MMMM");
+ formatPattern = replace(formatPattern, "/", ".");
+ SimpleDateFormat format;
+ try {
+ format = new SimpleDateFormat(formatPattern);
+ }
+ catch (Throwable thr) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Creating date format failed: '" + formatPattern + "'", thr);
+ }
+ format = new SimpleDateFormat();
+ }
+
+ double numberValue = cell.getNumericCellValue();
+ Date date = HSSFDateUtil.getJavaDate(numberValue);
+ cellValue = format.format(date);
+ } else {
+ // This is a Number
+ DecimalFormat format;
+ try {
+ format = new DecimalFormat(formatPattern);
+ }
+ catch (Throwable thr) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Creating number format failed: '" + formatPattern + "'", thr);
+ }
+ format = new DecimalFormat();
+ }
+
+ double numberValue = cell.getNumericCellValue();
+ cellValue = format.format(numberValue);
+ }
+ }
+
+ if (cellValue != null) {
+ cellValue = cellValue.trim();
+ if (cellValue.length() != 0) {
+ cleanBuffer.append(cellValue);
+ cleanBuffer.append(" ");
+ }
+ }
+ }
+
+ /**
+ * Checks cell is date formatted or not.
+ * @param cell
+ * @return boolean
+ */
+ private boolean isCellDateFormatted(HSSFCell cell) {
+ short format = cell.getCellStyle().getDataFormat();
+
+ if (HSSFDateUtil.isValidExcelDate(cell.getNumericCellValue())) {
+ if (HSSFDateUtil.isCellDateFormatted(cell)) {
+ return true;
+ } else {
+ String fmtText = mDataFormat.getFormat(format);
+
+ if (fmtText != null) {
+ fmtText = fmtText.toLowerCase();
+
+ if (fmtText.indexOf("d") >= 0
+ || fmtText.indexOf("m") >= 0
+ || fmtText.indexOf("y") >= 0
+ || fmtText.indexOf("h") >= 0
+ || fmtText.indexOf("s") >= 0) {
+ return true;
+ }
+ }
+ }
+ }
+
+ return false;
+ }
+
+    /**
+     * Replaces all occurrences of pattern in the source with the replacement
+     * value. The pattern is matched literally, not as a regular expression.
+     * @param source
+     *            text to search in; returned unchanged when null
+     * @param pattern
+     *            literal text to look for; when null or empty the source is
+     *            returned unchanged
+     * @param replacement
+     *            text inserted in place of every occurrence of the pattern
+     * @return the source with every occurrence of the pattern replaced, or
+     *         the source itself when nothing had to be replaced
+     */
+    public static String replace(String source, String pattern, String replacement) {
+        // Nothing to search in or for. An empty pattern matches at every
+        // position without advancing and would make the loop below run forever.
+        if (source == null || pattern == null || pattern.length() == 0) {
+            return source;
+        }
+
+        // Check whether the pattern occurs in the source at all
+        int end = source.indexOf(pattern);
+        if (end == -1) {
+            // The pattern does not occur in the source -> return the source
+            return source;
+        }
+
+        // Build a new String where pattern is replaced by the replacement
+        StringBuffer target = new StringBuffer(source.length());
+        int start = 0; // The start of a part without the pattern
+        do {
+            target.append(source.substring(start, end));
+            target.append(replacement);
+            start = end + pattern.length();
+        } while ((end = source.indexOf(pattern, start)) != -1);
+        target.append(source.substring(start));
+
+        return target.toString();
+    }
+
+}
Property changes on: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/MSExcelTextExtractor.java
___________________________________________________________________
Name: svn:eol-style
+ native
Added: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/MSPowerPointTextExtractor.java
===================================================================
--- xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/MSPowerPointTextExtractor.java 2006-12-06 21:50:31 UTC (rev 1689)
+++ xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/MSPowerPointTextExtractor.java 2006-12-06 22:29:02 UTC (rev 1690)
@@ -0,0 +1,23 @@
+/**
+ * Created by IntelliJ IDEA.
+ * User: lokesh
+ * Date: Oct 17, 2006
+ * Time: 3:31:38 PM
+ * To change this template use File | Settings | File Templates.
+ */
+
+package com.xpn.xwiki.plugin.lucene.textextraction;
+import org.apache.poi.hslf.extractor.PowerPointExtractor;
+
+import java.io.ByteArrayInputStream;
+
+/**
+ * Text extractor for Microsoft Power Point files.
+ * <p>
+ * Delegates the actual work to the POI {@link PowerPointExtractor}.
+ */
+public class MSPowerPointTextExtractor implements MimetypeTextExtractor {
+
+    /**
+     * Extracts the text of a Power Point presentation.
+     *
+     * @param data the complete raw bytes of the presentation file
+     * @return the text reported by the POI extractor
+     * @throws Exception if POI cannot read the presentation
+     */
+    public String getText(byte[] data) throws Exception {
+        ByteArrayInputStream input = new ByteArrayInputStream(data);
+        // Both flags are enabled; presumably they select slide text and
+        // notes text respectively -- confirm against the POI version in use.
+        return new PowerPointExtractor(input).getText(true, true);
+    }
+}
Property changes on: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/MSPowerPointTextExtractor.java
___________________________________________________________________
Name: svn:eol-style
+ native
Added: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/MSWordTextExtractor.java
===================================================================
--- xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/MSWordTextExtractor.java 2006-12-06 21:50:31 UTC (rev 1689)
+++ xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/MSWordTextExtractor.java 2006-12-06 22:29:02 UTC (rev 1690)
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2004 Outerthought bvba and Schaubroeck nv
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.xpn.xwiki.plugin.lucene.textextraction;
+import java.io.ByteArrayInputStream;
+
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.usermodel.Range;
+
+/**
+ * Text extractor for Microsoft Word files.
+ * <p>
+ * Delegates the actual work to the POI {@link HWPFDocument} model.
+ */
+public class MSWordTextExtractor implements MimetypeTextExtractor {
+
+    /**
+     * Extracts the text of a Word document.
+     *
+     * @param data the complete raw bytes of the Word file
+     * @return the text of the range returned by {@code HWPFDocument.getRange()}
+     * @throws Exception if POI cannot read the document
+     */
+    public String getText(byte[] data) throws Exception {
+        ByteArrayInputStream input = new ByteArrayInputStream(data);
+        Range documentRange = new HWPFDocument(input).getRange();
+        return documentRange.text();
+    }
+}
Property changes on: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/MSWordTextExtractor.java
___________________________________________________________________
Name: svn:eol-style
+ native
Added: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/MimetypeTextExtractor.java
===================================================================
--- xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/MimetypeTextExtractor.java 2006-12-06 21:50:31 UTC (rev 1689)
+++ xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/MimetypeTextExtractor.java 2006-12-06 22:29:02 UTC (rev 1690)
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2004 Outerthought bvba and Schaubroeck nv
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Changelog:
+ * jk at jkraemer.net: changed visibility of getText from package to public
+ */
+package com.xpn.xwiki.plugin.lucene.textextraction;
+
+/**
+ * A text extractor for a specific mime type.
+ * <p>
+ * An implementation receives the complete raw content of one document and
+ * returns the plain text it finds in it.
+ */
+public interface MimetypeTextExtractor {
+
+ /**
+  * Extracts the plain text from the given document content.
+  *
+  * @param data the complete raw bytes of the document
+  * @return the extracted text
+  * @throws Exception if the content cannot be read or the extraction fails
+  */
+ public String getText(byte[] data) throws Exception;
+}
Property changes on: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/MimetypeTextExtractor.java
___________________________________________________________________
Name: svn:eol-style
+ native
Added: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/OpenOfficeTextExtractor.java
===================================================================
--- xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/OpenOfficeTextExtractor.java 2006-12-06 21:50:31 UTC (rev 1689)
+++ xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/OpenOfficeTextExtractor.java 2006-12-06 22:29:02 UTC (rev 1690)
@@ -0,0 +1,103 @@
+/*
+ * Copyright 2004 Outerthought bvba and Schaubroeck nv
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.xpn.xwiki.plugin.lucene.textextraction;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+
+import org.xmlpull.mxp1.MXParser;
+import org.xmlpull.v1.XmlPullParser;
+
+import com.xpn.xwiki.plugin.lucene.textextraction.xmlutil.XmlEncodingDetector;
+
+/**
+ * Extracts all text from an OpenOffice document.
+ * <p>
+ * Only text located inside {@code p} elements of the OpenOffice text
+ * namespace of the document's {@code content.xml} is returned; every
+ * paragraph start contributes a single separating space.
+ */
+public class OpenOfficeTextExtractor implements MimetypeTextExtractor {
+    private static final String TEXTNAMESPACE = "http://openoffice.org/2000/text";
+
+    /** Name of the zip entry that holds the actual document content. */
+    private static final String CONTENT_ENTRY = "content.xml";
+
+    /**
+     * Extracts the paragraph text of an OpenOffice document.
+     *
+     * @param data the complete raw bytes of the document (a ZIP archive)
+     * @return the concatenated paragraph text, each paragraph preceded by a space
+     * @throws Exception if the archive has no {@code content.xml} entry, or if
+     *         reading the archive or parsing the XML fails
+     */
+    public String getText(byte[] data) throws Exception {
+        /*
+         * The byte array we receive here is in fact a ZIP containing the
+         * content.xml, styles.xml, meta.xml and META-INF/manifest.xml files.
+         * We are only interested in content.xml because that is the file
+         * containing the actual content.
+         */
+        ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(data));
+        try {
+            byte[] contentXml = readContentEntry(zis);
+            if (contentXml == null) {
+                throw new Exception("Invalid OpenOffice document format (content.xml not found)");
+            }
+            return extractParagraphText(contentXml);
+        } finally {
+            // Always release the stream, also when extraction fails.
+            zis.close();
+        }
+    }
+
+    /**
+     * Reads the uncompressed bytes of the content.xml entry.
+     *
+     * @param zis the archive stream, positioned before its first entry
+     * @return the bytes of content.xml, or {@code null} if there is no such entry
+     * @throws IOException if the archive cannot be read
+     */
+    private static byte[] readContentEntry(ZipInputStream zis) throws IOException {
+        ZipEntry entry;
+        while ((entry = zis.getNextEntry()) != null) {
+            if (!CONTENT_ENTRY.equals(entry.getName())) {
+                continue;
+            }
+            /*
+             * The stream is now positioned at the beginning of this entry;
+             * read() returns -1 as soon as the end of the entry is reached.
+             */
+            ByteArrayOutputStream content = new ByteArrayOutputStream();
+            byte[] buffer = new byte[4096];
+            int count;
+            while ((count = zis.read(buffer)) != -1) {
+                content.write(buffer, 0, count);
+            }
+            return content.toByteArray();
+        }
+        return null;
+    }
+
+    /**
+     * Parses content.xml and collects the text found inside paragraphs.
+     *
+     * @param contentXml the bytes of the content.xml entry
+     * @return the collected paragraph text
+     * @throws Exception if encoding detection or XML parsing fails
+     */
+    private static String extractParagraphText(byte[] contentXml) throws Exception {
+        /*
+         * We are using this XmlPullParser because it was impossible to work
+         * with a sax parser. The sax parser always wanted to have access to
+         * the openoffice dtd. Even tried to write our own entity resolver to
+         * work around this problem but that didn't work out. In order not to
+         * pin ourselves down to a specific sax implementor (where we e.g.
+         * would be able to specify that we explicitly don't want any check at
+         * all against a dtd) we chose not to use sax at all and use a very
+         * lightweight type of parsing for this specific goal.
+         */
+        XmlPullParser parser = new MXParser();
+        parser.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES, true);
+        // Detect the encoding from the XML bytes themselves, not from the
+        // surrounding ZIP container.
+        parser.setInput(new ByteArrayInputStream(contentXml),
+            XmlEncodingDetector.detectEncoding(contentXml));
+
+        StringBuffer text = new StringBuffer();
+        // Paragraphs can be nested (a paragraph inside another paragraph's
+        // content), so track the nesting depth instead of a single flag;
+        // otherwise the text following a nested paragraph would be dropped.
+        int paragraphDepth = 0;
+
+        int eventType = parser.getEventType();
+        while (eventType != XmlPullParser.END_DOCUMENT) {
+            eventType = parser.next();
+            if (eventType == XmlPullParser.START_TAG) {
+                if (isTextParagraph(parser)) {
+                    text.append(' ');
+                    paragraphDepth++;
+                }
+            } else if (eventType == XmlPullParser.END_TAG) {
+                if (isTextParagraph(parser) && paragraphDepth > 0) {
+                    paragraphDepth--;
+                }
+            } else if (eventType == XmlPullParser.TEXT) {
+                if (paragraphDepth > 0) {
+                    text.append(parser.getText());
+                }
+            }
+        }
+        return text.toString();
+    }
+
+    /**
+     * Tells whether the parser's current tag is a {@code p} element of the
+     * OpenOffice text namespace.
+     */
+    private static boolean isTextParagraph(XmlPullParser parser) {
+        return "p".equals(parser.getName())
+            && TEXTNAMESPACE.equals(parser.getNamespace());
+    }
+}
Property changes on: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/OpenOfficeTextExtractor.java
___________________________________________________________________
Name: svn:eol-style
+ native
Added: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/PDFTextExtractor.java
===================================================================
--- xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/PDFTextExtractor.java 2006-12-06 21:50:31 UTC (rev 1689)
+++ xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/PDFTextExtractor.java 2006-12-06 22:29:02 UTC (rev 1690)
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2004 Outerthought bvba and Schaubroeck nv
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.xpn.xwiki.plugin.lucene.textextraction;
+import org.pdfbox.pdmodel.PDDocument;
+import org.pdfbox.pdfparser.PDFParser;
+import org.pdfbox.util.PDFTextStripper;
+
+import java.io.ByteArrayInputStream;
+import java.io.CharArrayWriter;
+
+/**
+ * Text extractor for PDF files, built on the PDFBox parser and text stripper.
+ */
+public class PDFTextExtractor implements MimetypeTextExtractor {
+
+    /**
+     * Extracts the text of a PDF document.
+     *
+     * @param data the complete raw bytes of the PDF file
+     * @return the text written by the PDFBox text stripper
+     * @throws Exception if the PDF cannot be parsed or its text cannot be written
+     */
+    public String getText(byte[] data) throws Exception {
+        PDDocument document = null;
+        try {
+            PDFParser pdfParser = new PDFParser(new ByteArrayInputStream(data));
+            pdfParser.parse();
+            document = pdfParser.getPDDocument();
+
+            CharArrayWriter extracted = new CharArrayWriter();
+            new PDFTextStripper().writeText(document, extracted);
+            return extracted.toString();
+        } finally {
+            // Close the document once it has been obtained, whether or not
+            // the text extraction succeeded.
+            if (document != null) {
+                document.close();
+            }
+        }
+    }
+}
Property changes on: xwiki/trunk/core/src/main/java/com/xpn/xwiki/plugin/lucene/textextraction/PDFTextExtractor.java
___________________________________________________________________
Name: svn:eol-style
+ nativ