/*
 * WebSPHINX web crawling toolkit
 * Copyright (C) 1998,1999 Carnegie Mellon University 
 * 
 * This library is free software; you can redistribute it
 * and/or modify it under the terms of the GNU Library
 * General Public License as published by the Free Software 
 * Foundation, version 2.
 *
 * WebSPHINX homepage: http://www.cs.cmu.edu/~rcm/websphinx/
 */
package websphinx;

import java.util.Vector;
import java.util.Enumeration;
import java.io.IOException;
//#ifdef JDK1.1
import java.io.ObjectInputStream;
//#endif JDK1.1

public class Regexp extends Pattern {

    static com.oroinc.text.regex.PatternCompiler compiler 
            = new com.oroinc.text.regex.Perl5Compiler ();

    String stringRep;
    transient com.oroinc.text.regex.Pattern pattern;
    transient String[] fields;

    public Regexp (String pattern) {
        stringRep = pattern;
        init ();
    }
    
    public boolean equals (Object object) {
        if (! (object instanceof Regexp))
            return false;
        Regexp p = (Regexp)object;
        return p.stringRep.equals (stringRep);
    }        
    
    //#ifdef JDK1.1
    private void readObject (ObjectInputStream in) 
           throws IOException, ClassNotFoundException {
        in.defaultReadObject ();
        init ();
    }
    //#endif JDK1.1

    
    private void init () {
        synchronized (compiler) {
            try {
                this.pattern = compiler.compile (translateFields (stringRep));
            } catch (com.oroinc.text.regex.MalformedPatternException e) {
                throw new RuntimeException ("syntax error in pattern: " + pattern);
            }
        }
    }
    
    public String[] getFieldNames () {
        return fields;
    }
    
    public String toString () {
        return stringRep;
    }

    public PatternMatcher match (Region region) {
        return new RegexpMatcher (this, region);
    }

    public static String escape (String s) {
        return websphinx.util.Str.escape (s, '\\', "\\^.$|()[]*+?{}");
    }

    String translateFields (String s) {
        Vector vfields = new Vector ();
        boolean inEscape = false;

        StringBuffer output = new StringBuffer ();

        int len = s.length ();
        for (int i=0; i<len; ++i) {
            char c = s.charAt (i);
            if (inEscape) {
                output.append (c);
                inEscape = false;
            }
            else {
                switch (c) {
                  case '\\':
                    output.append (c);
                    inEscape = true;
                    break;

                  case '(':
                    output.append (c);
                    if (s.startsWith ("?{", i+1)) {
                        int start = i+3;
                        int end = s.indexOf ('}', start);
                        vfields.addElement (s.substring (start, end));
                        i = end;
                    }
                    else if (!s.startsWith ("?", i+1))
                        vfields.addElement (String.valueOf (vfields.size()));
                    break;

                  default:
                    output.append (c);
                    break;
                }
            }
        }

        fields = new String[vfields.size()];
        vfields.copyInto (fields);
        return output.toString ();
    }
    
    public static void main (String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println ("usage: Regexp <pattern> <source URL>*");
            return;
        }

        Pattern p = new Regexp (args[0].replace ('_', ' ') );
        for (int i=1; i<args.length; ++i) {
            Page page = new Page (new Link (args[i]));
            System.out.println ("--------------------" + args[i]);
            PatternMatcher m = p.match (page);
            for (Region r = m.nextMatch(); r != null; r = m.nextMatch()) {
                System.out.println ("[" + r.getStart() + "," + r.getEnd() + "]" + r);
                Enumeration enum = r.enumerateObjectLabels ();
                while (enum.hasMoreElements ()) {
                    String lbl = (String)enum.nextElement ();
                    Object object = r.getObjectLabel (lbl);
                    if (object instanceof Region) {
                        Region s = (Region)object;
                        System.out.println ("    "+lbl+"=[" + s.getStart() + "," + s.getEnd() + "]" + s);
                    }
                }
            }
        }
    }
}

/**
 * PatternMatcher that steps through successive matches of a Regexp inside a
 * Region, using an ORO Perl5Matcher.
 */
class RegexpMatcher extends PatternMatcher {
    // ORO matcher used for every search performed by this object.
    com.oroinc.text.regex.PatternMatcher matcher = new com.oroinc.text.regex.Perl5Matcher ();
    // Pattern being searched for; supplies the compiled form and field names.
    Regexp regexp;
    // Region being searched.
    Region source;
    // ORO input object built over the source region; passed to every
    // contains() call.
    com.oroinc.text.regex.PatternMatcherInput input;

    /**
     * Prepares to search a region for matches of a regular expression.
     *
     * @param regexp pattern to search for
     * @param source region to search; the matcher input is built from the
     *        content of the region's page plus the region's start and length
     */
    public RegexpMatcher (Regexp regexp, Region source) {
        this.regexp = regexp;
        this.source = source;

        Page page = source.getSource ();
        this.input = new com.oroinc.text.regex.PatternMatcherInput (
                page.getContent (), source.getStart (), source.getLength ());
    }

    /**
     * Finds the next match in the input.
     *
     * @return a Region spanning the whole match, with one field set per
     *         group (keyed by the Regexp's field names) and the full array
     *         of group regions stored under Pattern.groups; or null when
     *         the matcher finds no further match
     */
    protected Region findNext () {
        if (!matcher.contains (input, regexp.pattern))
            return null;

        com.oroinc.text.regex.MatchResult result = matcher.getMatch ();
        Page page = source.getSource ();

        // Group 0 is the entire match.
        Region whole = new Region (page, result.beginOffset (0), result.endOffset (0));

        // Groups 1..groupCount are the parenthesized subexpressions.
        int groupCount = result.groups () - 1;
        Region[] subregions = new Region[groupCount];
        for (int g = 1; g <= groupCount; ++g) {
            Region sub = new Region (page, result.beginOffset (g), result.endOffset (g));
            subregions[g-1] = sub;
            whole.setField (regexp.fields[g-1], sub);
        }

        whole.setFields (Pattern.groups, subregions);
        return whole;
    }
}
