package com.bitmechanic.spindle; /* * This file is a derived version of a file from Spindle, (c) bitmechanic.com. * * This version, based on Spindle 0.90, with fixes and enhancements added * by Richard Dallaway. * * Spindle is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * Spindle is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Spindle; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import java.io.StringReader; import java.net.HttpURLConnection; import java.net.URL; import java.security.Security; import java.util.ArrayList; import java.util.Enumeration; import java.util.HashMap; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; import cvu.html.HTMLTokenizer; import cvu.html.TagToken; import cvu.html.TextToken; /** * Provides a HTTPD crawler which builds a Lucene index for searching. * * @version $Revision: 1.3 $ $Date: 2004/06/20 21:34:51 $ */ public class Spider implements Runnable { private static String lineSep = System.getProperty("line.separator"); private String indexDir; private ArrayList urls; private ArrayList include; private ArrayList exclude; private ArrayList threadList; /** List (possibly zero length) of tags to include in the page description. */ private ArrayList descriptionTags; /** Treat documents contanining a name sections as separate documents in the index. */ private boolean eachNameIsADocument; /** The pattern to look for when splitting a document based on tags. */ private final Pattern aNamePattern = Pattern.compile("a name=(\"|')([^'\" ]+)", Pattern.CASE_INSENSITIVE); private boolean verbose; private boolean incremental; private boolean groksHTTPS; private IndexWriter index; private HashMap indexedURLs; private HashMap mimeTypes; private int threads; private int descSize; private int bytes; public static void main(String argv[]) throws Exception { Spider s = new Spider(argv); s.go(); } public Spider(String argv[]) { groksHTTPS = true; verbose = false; incremental = false; eachNameIsADocument = false; threads = 2; descSize = 256; bytes = 0; include = new ArrayList(); exclude = new ArrayList(); urls = new ArrayList(); threadList = new ArrayList(); descriptionTags = new ArrayList(); indexedURLs = new HashMap(); mimeTypes = new HashMap(); parseArgs(argv); } public void go() throws Exception { // create the index directory -- or append to existing if (verbose) { print("Creating index in: " + indexDir); if (incremental) print(" - using incremental mode"); } index = new IndexWriter( new File(indexDir), new StandardAnalyzer(), !incremental); // check if we can do https URLs try { System.setProperty( "java.protocol.handler.pkgs", "com.sun.net.ssl.internal.www.protocol"); Security.addProvider(new com.sun.net.ssl.internal.ssl.Provider()); URL url = new URL("https://www.bitmechanic.com/"); } catch (Exception e) { groksHTTPS = false; if (verbose) print("Disabling support for https URLs"); } // index each entry point URL long start = System.currentTimeMillis(); for (int i = 0; i < threads; i++) { Thread t = new Thread(this, "Spindle Spider Thread #" + (i + 1)); t.start(); threadList.add(t); } while (threadList.size() > 0) { Thread child = (Thread) threadList.remove(0); child.join(); } long elapsed = System.currentTimeMillis() - start; // save the index if (verbose) { print( "Indexed " + indexedURLs.size() + " URLs (" + (bytes / 1024) + " KB) in " + (elapsed / 1000) + " seconds"); print("Optimizing index"); } index.optimize(); index.close(); } public void run() { String url; try { while ((url = dequeueURL()) != null) { indexURL(url); } } catch (Exception e) { e.printStackTrace(); } threads--; } public synchronized String dequeueURL() throws Exception { while (true) { if (urls.size() > 0) { return (String) urls.remove(0); } else { threads--; if (threads > 0) { wait(); threads++; } else { notifyAll(); return null; } } } } public synchronized void enqueueURL(String url) { if (indexedURLs.get(url) == null) { urls.add(url); indexedURLs.put(url, Boolean.TRUE); notifyAll(); } } private void indexURL(String url) throws Exception { if (verbose) print( " " + Thread.currentThread().getName() + ": Adding URL: " + url); final URLSummary[] summaryList = loadURL(url); if (summaryList != null) { for( int s=0, n=summaryList.length; s 0); HTMLTokenizer ht = new HTMLTokenizer(new StringReader(summary.body)); for (Enumeration e = ht.getTokens(); e.hasMoreElements();) { Object obj = e.nextElement(); if (obj instanceof TagToken) { TagToken tag = (TagToken) obj; String tagName = tag.getName().toLowerCase(); if (watchDescriptionTags && isDescriptionTag(tagName)) { inDescriptionTag = !tag.isEndTag(); } String url = null; if (tagName.equals("a")) { url = tag.getAttributes().get("href"); } else if (tagName.equals("frame")) { url = tag.getAttributes().get("src"); } else if (tagName.equals("title") && e.hasMoreElements() && !tag.isEndTag()) { obj = e.nextElement(); if (obj instanceof TextToken) { TextToken title = (TextToken) obj; summary.title = title.getText(); } } if (url != null) { if (url.startsWith("http://") || (url.startsWith("https://") && groksHTTPS)) { // verify we're on the same host and port URL u = new URL(url); if (u.getHost().equals(summary.url.getHost()) && u.getPort() == summary.url.getPort()) { url = chopOffNamedAnchor(url); if (indexedURLs.get(url) == null) urls.add(url); } } else if ( url.indexOf("://") == -1 && !url.startsWith("mailto:") && !url.startsWith("#") && !url.startsWith("javascript:")) { // parse relative url url = formURL(summary.url, url); url = chopOffNamedAnchor(url); if (indexedURLs.get(url) == null) urls.add(url); } } } else if ((obj instanceof TextToken) && (inDescriptionTag || !watchDescriptionTags)) { // ... the !watchDescriptionTags part returns this code to default behaviour of using // all text as part of the description TextToken t = (TextToken) obj; String text = t.getText(); if (text != null && text.trim().length() > 0) desc.append(text.trim()).append(" "); } } if (desc.length() > descSize) desc.setLength(descSize); summary.desc = desc.toString(); String list[] = new String[urls.size()]; urls.toArray(list); return list; } private String chopOffNamedAnchor(final String url) { int pos = url.indexOf("#"); if (pos == -1) return url; else return url.substring(0, pos); } // converts relative URL to absolute URL private String formURL(final URL origURL, String newURL) { StringBuffer base = new StringBuffer(origURL.getProtocol()); base.append("://").append(origURL.getHost()); if (origURL.getPort() != -1) { base.append(":").append(origURL.getPort()); } if (newURL.startsWith("/")) { base.append(newURL); } else if (newURL.startsWith("..")) { String file = origURL.getFile(); } else { String file = origURL.getFile(); int pos = file.lastIndexOf("/"); if (pos != -1) file = file.substring(0, pos); while (newURL.startsWith("../")) { pos = file.lastIndexOf("/"); file = file.substring(0, pos); newURL = newURL.substring(3); } base.append(file).append("/").append(newURL); } return base.toString(); } private URLSummary[] loadURL(final String url) throws Exception { final URL u = new URL(url); HttpURLConnection uc = null; String ct = ""; try { uc = (HttpURLConnection) u.openConnection(); uc.setAllowUserInteraction(false); if (uc.getResponseCode() != 200) return null; ct = uc.getContentType(); } catch (FileNotFoundException e) { // 404 print("Unexpected status code="+uc.getResponseCode()+" for "+url); return null; } // RZD: the mime type could contain the page encoding, such as "text/html;charset=ISO-8859-1" // and we're only interested in the part before the semicolon. int semiColon = ct.indexOf(';'); if (semiColon != -1) { ct = ct.substring(0, semiColon); } if (mimeTypes.get(ct) == null) { return null; } BufferedReader in = new BufferedReader(new InputStreamReader(uc.getInputStream())); StringBuffer body = new StringBuffer(2048); String line; while ((line = in.readLine()) != null) { body.append(line); body.append(lineSep); } in.close(); URLSummary summary = new URLSummary(); summary.url = u; summary.body = body.toString(); List summaries = new ArrayList(); // We either treat the page as a single document to index or, at the user's request, // split the page into many documents, where each document is defined by an tag. // Note the quick-and-dirty check for an tag :-( if (eachNameIsADocument == false || summary.body.indexOf(": we might want to think about folding // this into the HTML tokenization process somehow. Matcher matcher = aNamePattern.matcher(summary.body); int positionInDocument = 0; // the first fragment starts at the top of the document String name = null; // the first fragment has no name, because it has no tag. while (matcher.find()) { // Where the match was found int startOfMatch = matcher.start(); print(" Forming document fragment #"+name); final URLSummary fragment = new URLSummary(); if (name == null) { fragment.url = summary.url; } else { fragment.url = new URL(summary.url.toExternalForm() + "#" + name); } // This fragment runs from where-we've-read-to up to the tag // we've found: fragment.body = summary.body.substring(positionInDocument, startOfMatch); summaries.add(fragment); // The next fragment body will start from here... positionInDocument = startOfMatch; // ...and will have this name: name = matcher.group(2); // groups are indexed from 1 } // Finally fragement-ize from the end of the last fragment to the end of the document: print(" Forming document fragment #"+name); final URLSummary fragment = new URLSummary(); fragment.url = new URL(summary.url.toExternalForm() + "#" + name); fragment.body = summary.body.substring(positionInDocument); // to the end summaries.add(fragment); } return (URLSummary[])summaries.toArray(new URLSummary[summaries.size()]); } private void parseArgs(String argv[]) { for (int i = 0; i < argv.length; i++) { if (argv[i].equals("-u")) urls.add(argv[++i]); else if (argv[i].equals("-d")) indexDir = argv[++i]; else if (argv[i].equals("-i")) include.add(argv[++i]); else if (argv[i].equals("-e")) exclude.add(argv[++i]); else if (argv[i].equals("-v")) verbose = true; else if (argv[i].equals("-a")) incremental = true; else if (argv[i].equals("-m")) mimeTypes.put(argv[++i], Boolean.TRUE); else if (argv[i].equals("-t")) threads = Integer.parseInt(argv[++i]); else if (argv[i].equals("-s")) descSize = Integer.parseInt(argv[++i]); else if (argv[i].equals("-dt")) { String tag = argv[i+1]; if (tag != null) { descriptionTags.add(tag.toLowerCase()); } } else if (argv[i].equals("-n")) { eachNameIsADocument = true; System.out.println(" splitting is on"); } } if (urls.size() == 0) throw new IllegalArgumentException("Missing required argument: -u [start url]"); if (indexDir == null) throw new IllegalArgumentException("Missing required argument: -d [index dir]"); if (threads < 1) throw new IllegalArgumentException( "Invalid number of threads: " + threads); if (mimeTypes.size() == 0) { // add default MIME types mimeTypes.put("text/html", Boolean.TRUE); mimeTypes.put("text/plain", Boolean.TRUE); } } private void print(String str) { System.out.println(str); } }