/*
 * WebSPHINX web crawling toolkit
 * Copyright (C) 1998,1999 Carnegie Mellon University 
 * 
 * This library is free software; you can redistribute it
 * and/or modify it under the terms of the GNU Library
 * General Public License as published by the Free Software 
 * Foundation, version 2.
 *
 * WebSPHINX homepage: http://www.cs.cmu.edu/~rcm/websphinx/
 */
package websphinx.searchengine;

import websphinx.*;
import java.net.URL;
import java.net.URLEncoder;
import java.net.MalformedURLException;

/**
 * <A href="http://www.metacrawler.com/">MetaCrawler</a> search engine.
 * <P>
 * Recognizes MetaCrawler result pages (by page title), labels their
 * result regions and "more results" links, and builds query URLs that
 * submit keywords to MetaCrawler.
 */
public class MetaCrawler implements SearchEngine {

    /**
     * Number of results requested per page.  Used both for the
     * <code>rpp</code> query parameter and for getResultsPerPage(), so
     * the two cannot drift apart.
     */
    private static final int RESULTS_PER_PAGE = 20;

    // Matches the summary line; the parenthesized group is the number
    // following "of" (the reference total reported by the page).
    static Pattern patCount = new Regexp (
        "Collated Results: 1 to \\d+ of (\\d+) references"
    );

    // Text shown on a page with no hits.  Not referenced in this class.
    static Pattern patNoHits = new Regexp (
        "Your search did not produce any results"
    );

    // One search result: names the subfields relevance, link, title
    // and description.
    static Pattern patResult = new Tagexp (
        "<dt><font color=#000000><b>(?{relevance})</b></font>"  // relevance rating
      + "(?{link}(?{title}<a>.*?</a>))"           // title and main link
      + "(?{description}<dt>.*?<font>)"      // description
    );

    // Earlier Regexp form of the more-results pattern, retained for reference:
    //static Pattern patMoreLink = new Regexp (
    //    "<a href=\"http://\\w+.metacrawler.com/crawler\\?general.*?\">\\d+</a>"
    //);
    // Links pointing back at a metacrawler.com "crawler?general" query,
    // i.e. links to further result pages.
    static Pattern patMoreLink = new Tagexp (
         "<a href=http://*.metacrawler.com/crawler\\?general*></a>"
    );

    /**
     * Classify a page.  Pages whose title does not start with
     * "Metacrawler query:" are left untouched.  Otherwise sets the
     * following labels:
     * <TABLE>
     * <TR><TH>Name <TH>Type  <TH>Meaning
     * <TR><TD>searchengine.source <TD>Page label <TD>MetaCrawler object that labeled the page
     * <TR><TD>searchengine.count <TD>Page field <TD>Field "0" of the count-pattern match
     * (presumably the captured "of N references" total -- confirm against Regexp
     * field numbering).  Set only when the count pattern matches.
     * <TR><TD>searchengine.results <TD>Page fields <TD>Array of results.  Each result region
     * is matched with subfields named relevance, title, description, and link.
     * <TR><TD>searchengine.more-results <TD>Link label <TD>Link to a page containing more results.
     * Such links are also labeled "hyperlink".
     * </TABLE>
     * @param page page to classify
     */
    public void classify (Page page) {
        String title = page.getTitle ();
        if (title == null || !title.startsWith ("Metacrawler query:")) {
            // Not a MetaCrawler result page; nothing to label.
            return;
        }

        page.setObjectLabel ("searchengine.source", this);

        // The count line is optional; leave the field unset when absent.
        Region count = patCount.oneMatch (page);
        if (count != null) {
            page.setField ("searchengine.count", count.getField ("0"));
        }

        // Wrap every matched result region in a SearchEngineResult.
        Region[] results = patResult.allMatches (page);
        SearchEngineResult[] ser = new SearchEngineResult[results.length];
        for (int i = 0; i < results.length; ++i) {
            ser[i] = new SearchEngineResult (results[i]);
        }
        page.setFields ("searchengine.results", ser);

        // Label each link that leads to another page of results.
        PatternMatcher m = patMoreLink.match (page);
        while (m.hasMoreElements ()) {
            Link link = (Link)m.nextMatch ();
            link.setLabel ("searchengine.more-results");
            link.setLabel ("hyperlink");
        }
    }

    /**
     * Priority of this classifier.
     */
    public static final float priority = 0.0F;

    /**
     * Get priority of this classifier.
     * @return priority.
     */
    public float getPriority () {
        return priority;
    }

    /**
     * Make a query URL for MetaCrawler.
     * @param keywords list of keywords, separated by spaces
     * @return URL that submits the keywords to MetaCrawler.
     * @throws RuntimeException if the constructed URL is malformed; the
     * underlying MalformedURLException is attached as the cause.
     */
    public URL makeQuery (String keywords) {
        try {
            // NOTE(review): the single-argument URLEncoder.encode is
            // deprecated (it uses the platform default encoding); it is
            // kept here so the generated query string does not change.
            return new URL("http://www.metacrawler.com/crawler?general="
                         + URLEncoder.encode(keywords)
                         + "&method=1&format=1&region=&rpp=" + RESULTS_PER_PAGE
                         + "&timeout=15&hpe=10");
        } catch (MalformedURLException e) {
            // Keep the original exception as the cause so the failure
            // can be diagnosed instead of being silently discarded.
            throw new RuntimeException ("internal error", e);
        }
    }

    /**
     * Get number of results per page for this search engine.
     * @return typical number of results per page
     */
    public int getResultsPerPage () {
        return RESULTS_PER_PAGE;
    }

    /**
     * Search MetaCrawler.
     * @param keywords list of keywords, separated by spaces
     * @return enumeration of SearchEngineResults returned by a MetaCrawler query constructed from the keywords.
     */
    public static Search search (String keywords) {
        return new Search (new MetaCrawler(), keywords);
    }

    /**
     * Search MetaCrawler.
     * @param keywords list of keywords, separated by spaces
     * @param maxResults maximum number of results to return
     * @return enumeration of SearchEngineResults returned by a MetaCrawler query constructed from the keywords.
     * The enumeration yields at most maxResults objects.
     */
    public static Search search (String keywords, int maxResults) {
        return new Search (new MetaCrawler(), keywords, maxResults);
    }
}
