/* Introduction a l'informatique II (ITI 1521)
 * Introduction to Computing II (ITI 1121)
 */

/**
 * 
 * @author Marcel Turcotte, Universite d'Ottawa/University of Ottawa
 */

import java.net.URL;
import java.net.MalformedURLException;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.NoSuchElementException;

/**
 * Represents an HTML document fetched from a URL and provides
 * iterator-style access ({@link #hasMoreURLs()} / {@link #nextURL()})
 * to the http/https URLs that occur in its text.
 *
 * The whole document is read once, in the constructor, and kept in
 * memory with its line separators removed.
 */
public class HTML {

	// CONSTANT(S)

	// RFC 2396 --- Uniform Resource Identifiers (URI): Generic Syntax
	// Link http://www.ietf.org/rfc/rfc2396.txt
	//
	// Matches "http://" or "https://" followed by one or more characters
	// that RFC 2396 allows in a URI: unreserved characters, reserved
	// characters, and '%' which introduces an escaped octet.

	private static final String URL_REG_EXP = "https?://[-_.!~*';/?:@&=+$,%a-z0-9]+";

	// Compiled once and shared by all instances; Pattern is immutable.
	private static final Pattern URL_PATTERN =
		Pattern.compile( URL_REG_EXP, Pattern.CASE_INSENSITIVE );

	// URLs that appear in the boilerplate of XHTML documents and are
	// therefore never reported as links.
	private static final String XHTML_DTD = "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd";
	private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";

	// instance variables

	private final URL url;          // the URL of this HTML document
	private final String content;   // the content of this HTML document
	private final Matcher matcher;  // a matcher to find the next URL in the document

	private String nextURL;   // a cache: the next URL to return, or null if none is cached

	/**
	 * Initialises an HTML object.  Fetches and stores the content
	 * of the Web page.
	 * 
	 * @param spec the URL of the document to be represented.
	 * @throws MalformedURLException if spec cannot be parsed as a URL.
	 * @throws IOException if the document cannot be opened or read.
	 */

	public HTML( String spec ) 
	throws MalformedURLException, IOException {

		this.url = new URL( spec );
		this.content = getContent( this.url );
		this.matcher = URL_PATTERN.matcher( this.content );
		this.nextURL = null;
	}

	/** Auxiliary method used to fetch the content of the Web document.
	 *  The lines of the document are concatenated without their line
	 *  separators.
	 *
	 * @param source the location to read from.
	 * @return the text of the document, line separators removed.
	 * @throws IOException if the stream cannot be opened or read.
	 */

	private static String getContent( URL source ) throws IOException {

		// NOTE(review): the reader uses the platform default charset; pages
		// in another encoding may have non-ASCII characters decoded wrongly.
		BufferedReader in = new BufferedReader( new InputStreamReader( source.openStream() ) );

		try {
			StringBuffer buffer = new StringBuffer();
			String line;

			while ( ( line = in.readLine() ) != null ) {  // readLine strips the line separator
				buffer.append( line ); // append is more efficient than string concatenation
			}

			return buffer.toString();
		} finally {
			in.close();  // release the connection even when reading fails
		}
	}

	/** Auxiliary method that is used by nextURL and hasMoreURLs.
	 * 	The next URL is cached: in order to determine if there is
	 * 	a next URL we need to find one, and nextURL must later return
	 * 	that same URL.  The URL that has been found is therefore saved
	 * 	in the variable nextURL.  If a URL is already cached, this
	 * 	method does nothing; if no further URL exists, nextURL stays
	 * 	null.
	 */

	private void getNextURL() {
		// Advance the matcher until a reportable URL is found or the
		// document is exhausted.
		while ( nextURL == null && matcher.find() ) {
			String match = matcher.group();
			boolean boilerplate = match.equals( XHTML_DTD ) || match.equals( XHTML_NAMESPACE );
			if ( ! boilerplate ) {
				nextURL = match;
			}
		}
	}

	/** Returns the next URL found in this page.
	 *
	 * @return the next URL found in the page.
	 * @throws NoSuchElementException if no more URLs are found in this page.
	 */

	public String nextURL() throws NoSuchElementException {
		getNextURL();
		if ( nextURL == null ) {
			throw new NoSuchElementException();
		}
		String savedURL = nextURL;
		nextURL = null;  // empty the cache so that the following call searches further
		return savedURL;
	}

	/** Returns true if there are more URLs that have not yet been
	 *  returned by calls to nextURL.
	 *
	 * @return true if this page has more URLs
	 */

	public boolean hasMoreURLs() {
		getNextURL();
		return nextURL != null;
	}

	/** Returns the content of this page.  Note. The line separators
	 *  have been removed.
	 *
	 * @return the content of the Web page designated by the URL.
	 */

	public String getPage() {
		return content;
	}
}
