otis 2002/06/21 08:02:51 Added: contributions/XML-Indexing-Demo IndexingRequest.xml contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo IndexFiles.java SearchFiles.java XMLDocumentHandlerDOM.java XMLDocumentHandlerSAX.java Log: - Unpacked the .zip file, for easier access to the source code. Revision Changes Path 1.1 jakarta-lucene-sandbox/contributions/XML-Indexing-Demo/IndexingRequest.xml Index: IndexingRequest.xml =================================================================== <customerInfo> <name><![CDATA[Aruna A. Raghavan]]></name> <profession><![CDATA[Software Developer]]></profession> <addressLine1><![CDATA[6801 West 106th Street]]></addressLine1> <addressLine2><![CDATA[#205]]></addressLine2> <city><![CDATA[Eagan]]></city> <state><![CDATA[MN]]></state> <zip><![CDATA[55121]]></zip> <country><![CDATA[USA]]></country> </customerInfo> 1.1 jakarta-lucene-sandbox/contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo/IndexFiles.java Index: IndexFiles.java =================================================================== package org.apache.lucenesandbox.xmlindexingdemo; /* ==================================================================== * The Apache Software License, Version 1.1 * * Copyright (c) 2001 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. 
The end-user documentation included with the redistribution, * if any, must include the following acknowledgment: * "This product includes software developed by the * Apache Software Foundation (http://www.apache.org/)." * Alternately, this acknowledgment may appear in the software itself, * if and wherever such third-party acknowledgments normally appear. * * 4. The names "Apache" and "Apache Software Foundation" and * "Apache Lucene" must not be used to endorse or promote products * derived from this software without prior written permission. For * written permission, please contact [EMAIL PROTECTED] * * 5. Products derived from this software may not be called "Apache", * "Apache Lucene", nor may "Apache" appear in their name, without * prior written permission of the Apache Software Foundation. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation. For more * information on the Apache Software Foundation, please see * <http://www.apache.org/>. 
*/ import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.index.IndexWriter; import java.io.File; import java.util.Date; class IndexFiles { public static void main(String[] args) throws Exception { try { Date start = new Date(); IndexWriter writer = new IndexWriter("index", new StandardAnalyzer(), true); indexDocs(writer, new File(args[0])); writer.optimize(); writer.close(); Date end = new Date(); System.out.print(end.getTime() - start.getTime()); System.out.println(" total milliseconds"); } catch (Exception e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); throw e; } } public static void indexDocs(IndexWriter writer, File file) throws Exception { if (file.isDirectory()) { String[] files = file.list(); for (int i = 0; i < files.length; i++) indexDocs(writer, new File(file, files[i])); } else { System.out.println("adding " + file); XMLDocumentHandlerSAX hdlr = new XMLDocumentHandlerSAX(file); writer.addDocument(hdlr.getDocument()); // For DOM, use // XMLDocumentHandlerDOM hdlr = new XMLDocumentHandlerDOM(); // writer.addDocument(hdlr.createXMLDocument(file)); } } } 1.1 jakarta-lucene-sandbox/contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo/SearchFiles.java Index: SearchFiles.java =================================================================== package org.apache.lucenesandbox.xmlindexingdemo; /* ==================================================================== * The Apache Software License, Version 1.1 * * Copyright (c) 2001 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. The end-user documentation included with the redistribution, * if any, must include the following acknowledgment: * "This product includes software developed by the * Apache Software Foundation (http://www.apache.org/)." * Alternately, this acknowledgment may appear in the software itself, * if and wherever such third-party acknowledgments normally appear. * * 4. The names "Apache" and "Apache Software Foundation" and * "Apache Lucene" must not be used to endorse or promote products * derived from this software without prior written permission. For * written permission, please contact [EMAIL PROTECTED] * * 5. Products derived from this software may not be called "Apache", * "Apache Lucene", nor may "Apache" appear in their name, without * prior written permission of the Apache Software Foundation. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation. 
For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */

import java.io.IOException;
import java.io.BufferedReader;
import java.io.InputStreamReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Hits;
import org.apache.lucene.queryParser.QueryParser;

/**
 * Interactive command-line search over the index in the directory
 * <code>index</code>.
 *
 * <p>Queries are read line by line from standard input and parsed with
 * <code>"name"</code> passed as the query parser's field. Matching documents
 * are printed ten at a time.
 */
class SearchFiles {

  /** Number of hits printed before the user is asked whether to continue. */
  private static final int HITS_PER_PAGE = 10;

  /**
   * Runs the query loop until end of input or an empty query line.
   *
   * <p>Any exception is reported on standard output and ends the program.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    try {
      Searcher searcher = new IndexSearcher("index");
      try {
        Analyzer analyzer = new StandardAnalyzer();
        BufferedReader in = new BufferedReader(new InputStreamReader(System.in));

        while (true) {
          System.out.print("Query: ");
          String line = in.readLine();

          // readLine() returns null at end of input; an empty line also quits.
          if (line == null || line.trim().length() == 0) {
            break;
          }

          Query query = QueryParser.parse(line, "name", analyzer);
          System.out.println("Searching for: " + query.toString("name"));

          Hits hits = searcher.search(query);
          System.out.println(hits.length() + " total matching documents");

          for (int start = 0; start < hits.length(); start += HITS_PER_PAGE) {
            int end = Math.min(hits.length(), start + HITS_PER_PAGE);
            for (int i = start; i < end; i++) {
              printHit(hits.doc(i));
            }

            if (hits.length() > end) {
              System.out.print("more (y/n) ? ");
              line = in.readLine();
              // Treat end of input like an explicit "n".
              if (line == null || line.length() == 0 || line.charAt(0) == 'n') {
                break;
              }
            }
          }
        }
      } finally {
        // Close the searcher on every exit path, including failures.
        searcher.close();
      }
    } catch (Exception e) {
      System.out.println(" caught a " + e.getClass()
                         + "\n with message: " + e.getMessage());
    }
  }

  /** Prints the address fields of one matching document. */
  private static void printHit(Document doc) {
    System.out.println(doc.get("name"));
    System.out.println(doc.get("profession"));
    System.out.println(doc.get("addressLine1"));
    System.out.println(doc.get("addressLine2"));
    System.out.print(doc.get("city"));
    System.out.print(" ");
    System.out.print(doc.get("state"));
    System.out.print(" ");
    System.out.print(doc.get("zip"));
    // Separator was missing here, so zip and country ran together.
    System.out.print(" ");
    System.out.println(doc.get("country"));
  }
}

1.1 jakarta-lucene-sandbox/contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo/XMLDocumentHandlerDOM.java

Index: XMLDocumentHandlerDOM.java
===================================================================
package org.apache.lucenesandbox.xmlindexingdemo;

import org.w3c.dom.*;
import org.w3c.dom.Node;
import javax.xml.parsers.*;
import org.apache.lucene.document.Field;

import java.io.File;

/**
 * Builds a Lucene document from an XML file using a DOM parser.
 *
 * <p>Every CDATA section found in the file becomes one text field whose name
 * is the name of the element that contains the section.
 */
public class XMLDocumentHandlerDOM {

  /**
   * Parses <code>f</code> and collects its CDATA sections into a Lucene
   * document.
   *
   * <p>Parse failures are not propagated: the error and its stack trace are
   * printed and the document built so far (possibly empty) is returned.
   *
   * @param f the XML file to parse
   * @return the Lucene document; never <code>null</code>
   */
  public org.apache.lucene.document.Document createXMLDocument(File f) {
    org.apache.lucene.document.Document document =
        new org.apache.lucene.document.Document();
    DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
    try {
      DocumentBuilder df = dbf.newDocumentBuilder();
      org.w3c.dom.Document d = df.parse(f);
      Node root = d.getDocumentElement();
      traverseTree(root, document);
    } catch (Exception e) {
      System.out.println("error: " + e);
      e.printStackTrace();
    }
    return document;
  }

  /**
   * Walks the DOM subtree rooted at <code>node</code> depth-first and adds a
   * field for each CDATA section whose parent is an element.
   *
   * @param node     root of the subtree to visit
   * @param document Lucene document that receives the fields
   */
  private static void traverseTree(Node node,
                                   org.apache.lucene.document.Document document) {
    if (node.getNodeType() == Node.CDATA_SECTION_NODE) {
      Node parentNode = node.getParentNode();
      if (parentNode != null && parentNode.getNodeType() == Node.ELEMENT_NODE) {
        // Use the enclosing element's name as the field name, matching what
        // XMLDocumentHandlerSAX produces for the same input.
        document.add(Field.Text(parentNode.getNodeName(), node.getNodeValue()));
      }
      return;
    }

    NodeList children = node.getChildNodes();
    for (int i = 0; i < children.getLength(); i++) {
      traverseTree(children.item(i), document);
    }
  }
}

1.1 
jakarta-lucene-sandbox/contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo/XMLDocumentHandlerSAX.java

Index: XMLDocumentHandlerSAX.java
===================================================================
package org.apache.lucenesandbox.xmlindexingdemo;

import org.xml.sax.*;
import org.xml.sax.helpers.*;
import org.xml.sax.AttributeList;
import javax.xml.parsers.*;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import java.io.File;
import java.io.IOException;

/**
 * Builds a Lucene document from an XML file using a SAX parser.
 *
 * <p>The file is parsed in the constructor. Each element end adds one text
 * field named after the element, holding the character data collected since
 * the most recent element start or end.
 */
public class XMLDocumentHandlerSAX extends HandlerBase {

  /** Character data collected for the element currently being read. */
  private final StringBuffer elementBuffer = new StringBuffer();

  /** The document being built; created when the parser reports document start. */
  private Document mDocument;

  /**
   * Parses <code>xmlFile</code> immediately, using this object as the handler.
   *
   * @param xmlFile the XML file to parse
   * @throws ParserConfigurationException if no SAX parser can be created
   * @throws SAXException if the file is not well-formed XML
   * @throws IOException if the file cannot be read
   */
  public XMLDocumentHandlerSAX(File xmlFile)
      throws ParserConfigurationException, SAXException, IOException {
    SAXParserFactory spf = SAXParserFactory.newInstance();
    SAXParser parser = spf.newSAXParser();
    parser.parse(xmlFile, this);
  }

  /** Called at document start: creates the empty Lucene document. */
  public void startDocument() {
    mDocument = new Document();
  }

  /**
   * Called at element start: discards any text collected so far.
   *
   * @param localName the element name reported by the parser
   * @param atts      the element's attributes (not indexed)
   */
  public void startElement(String localName, AttributeList atts)
      throws SAXException {
    elementBuffer.setLength(0);
  }

  /** Called for character data, including CDATA: appends it to the buffer. */
  public void characters(char[] text, int start, int length) {
    elementBuffer.append(text, start, length);
  }

  /**
   * Called at element end: stores the collected text as a field named after
   * the element.
   *
   * @param localName the element name reported by the parser
   */
  public void endElement(String localName) throws SAXException {
    mDocument.add(Field.Text(localName, elementBuffer.toString()));
    // Clear the buffer so this element's text is not stored a second time
    // under the enclosing element when that element ends.
    elementBuffer.setLength(0);
  }

  /**
   * Returns the document built while parsing.
   *
   * @return the Lucene document populated by the parse run in the constructor
   */
  public Document getDocument() {
    return mDocument;
  }
}
-- To unsubscribe, e-mail: <mailto:[EMAIL PROTECTED]> For additional commands, e-mail: <mailto:[EMAIL PROTECTED]>