#!/usr/local/perl5/bin/perl -w
#

use strict;

# Global variables
#
my %frefs;              # file entities declared in internal subset
my %element_frequency;  # element frequency list
my $lastline = "";      # the last line parsed
my $allnodecount = 0;   # total number of nodes parsed
my %nodecount =         # how many nodes have been parsed
                             (
			      'attribute'   => 0,
			      'CDMS'        => 0,
			      'comment'     => 0,
			      'element'     => 0,
			      'PI'          => 0,
			      'text'        => 0,
			      );

# start the process
&main();

# main
# ----
# parse XML document and print out statistics
#
sub main {
    # read document, starting at top-level file
    my $file = shift @ARGV;   
    unless( $file && -e $file ) {
	print "File '$file' not found.\n";
	exit(1);
    }
    my $text = &process_file( $file );
    
    # parse the document entity
    &parse_document( $text );

    # print node stats
    print "\nNode frequency:\n";
    my $type;
    foreach $type (keys %nodecount) {
	print "  " . $nodecount{ $type } . "\t" . $type . " nodes\n";
    }
    print "\n  " . $allnodecount . "\ttotal nodes\n";

    # print element stats
    print "\nElement frequency:\n";
    foreach $type (sort keys %element_frequency) {
	print "  " . $element_frequency{ $type } . "\t<" . $type . ">\n";
    }
}


# process_file
# ------------
# Get text from all XML files in document. 
#
sub process_file {
    my( $file ) = @_;
    unless( open( F, $file )) {
	print STDERR "Can't open \"$file\" for reading.\n";
	return "";
    }
    my @lines = <F>;
    close F;
    my $line;
    my $buf = "";
    my $linenumber = 0;
    foreach $line (@lines) {

	# Tack on line number and filename information
	$linenumber ++;
	$buf .= "%$file:$linenumber%";
	# Replace external entity references with file contents
	if( $line =~ /\&([^;]+);/ && $frefs{$1} ) {
	    my( $pre, $ent, $post ) = ($`, $&, $' );
	    my $newfile = $frefs{$1};
	    $buf .= $pre . $ent . "\n<?xml-file startfile: $newfile ?>" .
		&process_file( $frefs{$1} ) . "<?xml-file endfile ?>" . 
		    $post;
	} else {
	    $buf .= $line;
	}

	# Add declared external entities to the list.
	# NOTE: we do not handle PUBLIC identifiers!
	$frefs{ $1 } = $2 
	    if( $line =~ /<!ENTITY\s+(\S+)\s+SYSTEM\s+\"([^\"]+)/ );
    }
    return $buf;
}


# parse_document
# --------------
# Read nodes at top level of document.
#
sub parse_document {
    my( $text ) = @_;
    while( $text ) {
	$text = &get_node( $text );
    }
}


# get_node
# --------
# Given a piece of XML text, return the first node found
# and return the rest of the text string.
#
sub get_node {
    my( $text ) = @_;
    
    # text
    if( $text =~ /^[^<]+/ ) {
	$text = $';
	$nodecount{ 'text' } ++;

    # imperative markup: comment, marked section, declaration
    } elsif( $text =~ /^\s*<\!/ ) {

	# comment
	if( $text =~ /^\s*<\!--(.*?)-->/s ) {
	    $text = $';
	    $nodecount{ 'comment' } ++;
	    my $data = $1;
	    if( $data =~ /--/ ) {
		&parse_error( "comment contains partial delimiter (--)" );
	    }

	# CDATA marked section (treat this like a node)
	} elsif( $text =~ /^\s*<\!\[\s*CDATA\s*\[/ ) {
	    $text = $';
	    if( $text =~ /\]\]>/ ) {
		$text = $';
	    } else {
		&parse_error( "CDMS syntax" );
	    }
	    $nodecount{ 'CDMS' } ++;

	# document type declaration
	} elsif( $text =~ /^\s*<!DOCTYPE.*?\]>\s*/s ||
		 $text =~ /^\s*<!DOCTYPE.*?>\s*/s ) {
	    $text = $';

	# parse error
	} else {
	    &parse_error( "declaration syntax" );
	}

    # processing instruction
    } elsif( $text =~ /^\s*<\?/ ) {
	if( $text =~ /^\s*<\?\s*[^\s\?]+\s*.*?\s*\?>\s*/s ) {
	    $text = $';
	    $nodecount{ 'PI' } ++;
	} else {
	    &parse_error( "PI syntax" );
	}

    # element
    } elsif( $text =~ /\s*</ ) {

	# empty element with atts
	if( $text =~ /^\s*<([^\/\s>]+)\s+([^\s>][^>]+)\/>/) {
	    $text = $';
	    $element_frequency{ $1 } ++;
	    my $atts = $2;
	    &parse_atts( $atts );
	    
	# empty element, no atts
	} elsif( $text =~ /^\s*<([^\/\s>]+)\s*\/>/) {
	    $text = $';
	    $element_frequency{ $1 } ++;

	# container element
	} elsif( $text =~ /^\s*<([^\/\s>]+)[^<>]*>/) {
	    my $name = $1;
	    $element_frequency{ $name } ++;
	    
	    # process attributes
	    my $atts = "";
	    $atts = $1 if( $text =~ /^\s*<[^\/\s>]+\s+([^\s>][^>]+)>/);
	    $text = $';
	    &parse_atts( $atts ) if $atts;
	    # process child nodes
	    while( $text !~ /^<\/$name\s*>/ ) {
		$text = &get_node( $text );
	    }
	    # check for end tag
	    if( $text =~ /^<\/$name\s*>/ ) {  
		$text = $';
	    } else {
		&parse_error( "end tag for element <$name>" );
	    }
	    $nodecount{ 'element' } ++;

        # some kind of parse error	    
	} else {
            if( $text =~ /^\s*<\/([^>]+)/ ) {
                &parse_error( "missing start tag for element <$1>" );
	    } else {
		&parse_error( "reserved character (<) in text" );
	    }
	}

    } else {
	&parse_error( "unidentified text" );
    }

    # update running info
    $allnodecount ++;
    $lastline = $& if( $text =~ /%[:]+:[:]+/ );
    return $text;
}


# parse_atts
# ----------
# verify syntax of attributes
#
sub parse_atts {
    my( $text ) = @_;
    $text =~ s/%.*?%//sg;
    while( $text ) {
	if( $text =~ /\s*([^\s=]+)\s*=\s*([\"][^\"]*[\"])/ ||
	    $text =~ /\s*([^\s=]+)\s*=\s*([\'][^\']*[\'])/) {
	    $text = $';
	    $nodecount{'attribute'} ++;
	    $allnodecount ++;
	} elsif( $text =~ /^\s+/ ) {
	    $text = $';
	} else {
	    &parse_error( "attribute syntax" );
	}
    }
}


# parse_error
# -----------
# abort parsing and print error message with line number and file name
# where error occured
#
sub parse_error {
    my( $reason ) = @_;
    my $line = 0;
    my $file = "unknown file";
    if( $lastline =~ /%([^:]+):([^%]+)/ ) {
	$file = $1;
	$line = $2 - 1;
    }
    die( "Parse error on line $line in $file: $reason.\n" );
}
