import websphinx.*;
import java.lang.reflect.*;
import java.sql.*;

/**
 * Program entry point: configures a {@link GooglitoCrawl} and runs it.
 */
public class EjecutarGooglito {
	public EjecutarGooglito() {
	}

	/**
	 * Builds the crawler, applies the crawl and download settings, and starts
	 * the crawl from a fixed Wikipedia seed page.
	 *
	 * @param args command-line arguments (not used)
	 * @throws java.net.MalformedURLException if the seed URL cannot be parsed
	 */
	public static void main(String[] args) throws java.net.MalformedURLException {
		GooglitoCrawl crawler = new GooglitoCrawl();
		Link seed = new Link("http://en.wikipedia.org/wiki/Video_game");
		DownloadParameters baseParameters = crawler.getDownloadParameters();

		// Crawl settings.
		crawler.setRoot(seed);
		crawler.addClassifier(new StandardClassifier());
		crawler.addLinkListener(new EventLog());
		crawler.setMaxDepth(10);                // depth limit of 10
		crawler.setDepthFirst(false);           // breadth-first traversal
		crawler.setDomain(Crawler.SUBTREE);     // search restricted to the subtree
		crawler.setLinkType(Crawler.HYPERLINKS); // follow hyperlinks only
		crawler.setIgnoreVisitedLinks(true);    // visit each link only once

		// Download settings: a page size limit (in KB according to the original
		// comment) and a browser-like User-Agent string.
		DownloadParameters tunedParameters = baseParameters
			.changeMaxPageSize(10000)
			.changeUserAgent("Mozilla/4.0 (compatible; MSIE 4.0; Windows NT)");
		crawler.setDownloadParameters(tunedParameters);

		// Googlito runs here until there are no more distinct links left.
		crawler.run();
	}
}

/**
 * Crawler that only follows English Wikipedia article URLs and stores the
 * words of each visited page through a {@link GooglitoRepository}.
 */
class GooglitoCrawl extends Crawler {
	/** Visit count above which the crawl is stopped. */
	private static final int MAXPAGES = 1000;

	/**
	 * URLs that are never visited: those ending in .mov, .jpg, .gif or .pdf,
	 * or containing ".php" followed only by non-whitespace up to the end.
	 * Compiled once instead of on every shouldVisit() call.
	 */
	private static final java.util.regex.Pattern EXCLUDED_URL =
		java.util.regex.Pattern.compile(".*(\\.mov|\\.jpg|\\.gif|\\.pdf|\\.php\\S*)$");

	/** URLs that may be visited: everything under the English Wikipedia /wiki/ path. */
	private static final java.util.regex.Pattern WIKI_URL =
		java.util.regex.Pattern.compile("http://en.wikipedia.org/wiki/\\S*$");

	private GooglitoRepository repository=new GooglitoRepository();

	/**
	 * Decides which links are to be visited.
	 *
	 * @param l candidate link
	 * @return true when the link's page URL is a Wikipedia /wiki/ URL and does
	 *         not match the excluded-URL pattern
	 */
	public boolean shouldVisit(Link l) {
		String the_url = l.getPageURL().toString();
		// Pattern.matcher(s).matches() is what String.matches(regex) does,
		// minus the per-call regex compilation.
		return !EXCLUDED_URL.matcher(the_url).matches()
			&& WIKI_URL.matcher(the_url).matches();
	}

	/**
	 * Processes a visited page: registers its URL, stores its words, releases
	 * the page content and prints progress statistics.
	 *
	 * @param page the page that was just visited
	 */
	public void visit(Page page) {
		// TODO: insert the page's links (page.getLinks()) into the links table.

		//******************** TEXT PROCESSOR ********************************
		int id_page = repository.insertPage(page.getURL().toString());

		// Parse the page first if it has not been parsed yet.
		if (!page.isParsed()) {
			page.parse(new HTMLParser());
		}
		Text[] words = page.getWords();

		// insertPage returns -1 when no id could be obtained; in that case the
		// words are not stored, since they would be linked to a non-existent page.
		// The null check on words is purely defensive.
		if (id_page >= 0 && words != null) {
			for (int k = 0; k < words.length; k++) {
				repository.insertWord(words[k].toText(), id_page);
			}
		}
		//*************************************************************************

		// Some memory clean-up.
		page.getOrigin().setPage(null);
		page.discardContent();

		// Statistics every 10 visits; the counter is read once so the printed
		// value is the same one used for both checks.
		int n = this.getPagesVisited();
		if (n % 10 == 0) {
			System.out.println(n + " pages visited.  " + this.getPagesLeft() + " pages left.  " + this.getActiveThreads() + " active threads.");
		}
		// Stop once the maximum number of pages has been exceeded.
		if (n > MAXPAGES) {
			this.stop();
		}
	}
}

/**
 * Thin JDBC layer over the "googlito" MySQL database: stores page URLs,
 * keywords, and the keyword/page association.
 *
 * All queries use parameterized statements, and every statement and result
 * set is a local that is closed in a finally block, so no JDBC state is kept
 * in fields between calls.
 */
class GooglitoRepository {

	/** Open connection, or null when the driver or the connection could not be set up. */
	private Connection conn;

	/**
	 * Loads the MySQL driver and opens the connection. Failures are reported on
	 * standard output and leave the repository without a connection; the insert
	 * methods then do nothing instead of throwing.
	 */
	public GooglitoRepository() { 
		try {
			// load the driver
			Class.forName("com.mysql.jdbc.Driver").newInstance();

			// connect
			conn = DriverManager.getConnection("jdbc:mysql://localhost/googlito?" + "user=root&password=");

		} catch (SQLException ex) {
			reportSqlError(ex);
		} catch (ClassNotFoundException ex) {
			System.out.println("Class not Found Exception"+ ex.getMessage());
		} catch (Exception ex) {
			// Previously swallowed silently; report it like the other failures.
			System.out.println("Found Exception "+ ex.getMessage());
		}

	}

	/**
	 * Inserts a keyword (if not already present) and associates it with a page.
	 *
	 * @param word    keyword text; null is stored as the empty string
	 * @param id_page id of the page the keyword was found on
	 */
	public void insertWord(String word, int id_page) {
		if (conn == null) {
			return; // no connection was established in the constructor
		}
		String value = (word == null) ? "" : word;
		try {
			executeWithString("INSERT IGNORE INTO keyword (word) values (?)", value);

			int id_word = lookupId("select id from keyword where word=?", value);
			if (id_word < 0) {
				return; // keyword row not found: do not link the -1 sentinel to the page
			}

			PreparedStatement link = conn.prepareStatement(
				"INSERT IGNORE INTO keyword_page (id_word, id_page) values (?, ?)");
			try {
				link.setInt(1, id_word);
				link.setInt(2, id_page);
				link.executeUpdate();
			} finally {
				link.close();
			}
		} catch (SQLException ex) {
			reportSqlError(ex);
		}
	}

	/**
	 * Inserts a page URL (if not already present) and returns its id.
	 *
	 * @param pageURL URL of the page; null is stored as the empty string
	 * @return the id of the page row, or -1 when there is no connection, the
	 *         row could not be found, or an SQL error occurred
	 */
	public int insertPage(String pageURL) {
		int new_id_page=-1;
		if (conn == null) {
			return new_id_page; // no connection was established in the constructor
		}
		String value = (pageURL == null) ? "" : pageURL;
		try {
			executeWithString("INSERT IGNORE INTO page (url) values (?)", value);
			new_id_page = lookupId("select id from page where url=?", value);
		} catch (SQLException ex) {
			reportSqlError(ex);
		}
		return(new_id_page);
	}


	// Utilities

	/**
	 * Returns a copy of the string with a backslash inserted before every
	 * double quote. Only double quotes are handled, so this is not a general
	 * SQL escaping routine; the repository's own queries no longer use it and
	 * rely on statement parameters instead. Kept for existing callers.
	 *
	 * @param str input text, may be null
	 * @return the escaped text, or the empty string when str is null
	 */
	public String addSlashes(String str){
		if(str==null) return "";

		StringBuffer s = new StringBuffer(str.length() + 8);
		for (int i = 0; i < str.length(); i++) {
			char c = str.charAt(i);
			if (c == '\"') {
				s.append('\\');
			}
			s.append(c);
		}
		return s.toString();

	}

	/** Runs a statement that has exactly one string parameter and returns no rows. */
	private void executeWithString(String sql, String value) throws SQLException {
		PreparedStatement ps = conn.prepareStatement(sql);
		try {
			ps.setString(1, value);
			ps.executeUpdate();
		} finally {
			ps.close();
		}
	}

	/** Runs a one-string-parameter query and returns the "id" of its first row, or -1 if there is none. */
	private int lookupId(String sql, String value) throws SQLException {
		PreparedStatement ps = conn.prepareStatement(sql);
		try {
			ps.setString(1, value);
			ResultSet rows = ps.executeQuery();
			try {
				return rows.next() ? rows.getInt("id") : -1;
			} finally {
				rows.close();
			}
		} finally {
			ps.close();
		}
	}

	/** Prints the message, SQL state and vendor code of an SQL failure. */
	private void reportSqlError(SQLException ex) {
		System.out.println("SQLException: " + ex.getMessage());
		System.out.println("SQLState: " + ex.getSQLState());
		System.out.println("VendorError: " + ex.getErrorCode());
	}

} 