#!/usr/local/perl5/bin/perl -w # use strict; # Global variables # my %frefs; # file entities declared in internal subset my %element_frequency; # element frequency list my $lastline = ""; # the last line parsed my $allnodecount = 0; # total number of nodes parsed my %nodecount = # how many nodes have been parsed ( 'attribute' => 0, 'CDMS' => 0, 'comment' => 0, 'element' => 0, 'PI' => 0, 'text' => 0, ); # start the process &main(); # main # ---- # parse XML document and print out statistics # sub main { # read document, starting at top-level file my $file = shift @ARGV; unless( $file && -e $file ) { print "File '$file' not found.\n"; exit(1); } my $text = &process_file( $file ); # parse the document entity &parse_document( $text ); # print node stats print "\nNode frequency:\n"; my $type; foreach $type (keys %nodecount) { print " " . $nodecount{ $type } . "\t" . $type . " nodes\n"; } print "\n " . $allnodecount . "\ttotal nodes\n"; # print element stats print "\nElement frequency:\n"; foreach $type (sort keys %element_frequency) { print " " . $element_frequency{ $type } . "\t<" . $type . ">\n"; } } # process_file # ------------ # Get text from all XML files in document. # sub process_file { my( $file ) = @_; unless( open( F, $file )) { print STDERR "Can't open \"$file\" for reading.\n"; return ""; } my @lines = ; close F; my $line; my $buf = ""; my $linenumber = 0; foreach $line (@lines) { # Tack on line number and filename information $linenumber ++; $buf .= "%$file:$linenumber%"; # Replace external entity references with file contents if( $line =~ /\&([^;]+);/ && $frefs{$1} ) { my( $pre, $ent, $post ) = ($`, $&, $' ); my $newfile = $frefs{$1}; $buf .= $pre . $ent . "\n" . &process_file( $frefs{$1} ) . "" . $post; } else { $buf .= $line; } # Add declared external entities to the list. # NOTE: we do not handle PUBLIC identifiers! $frefs{ $1 } = $2 if( $line =~ //s ) { $text = $'; $nodecount{ 'comment' } ++; my $data = $1; if( $data =~ /--/ ) { &parse_error( "comment contains partial delimiter (--)" ); } # CDATA marked section (treat this like a node) } elsif( $text =~ /^\s*<\!\[\s*CDATA\s*\[/ ) { $text = $'; if( $text =~ /\]\]>/ ) { $text = $'; } else { &parse_error( "CDMS syntax" ); } $nodecount{ 'CDMS' } ++; # document type declaration } elsif( $text =~ /^\s*\s*/s || $text =~ /^\s*\s*/s ) { $text = $'; # parse error } else { &parse_error( "declaration syntax" ); } # processing instruction } elsif( $text =~ /^\s*<\?/ ) { if( $text =~ /^\s*<\?\s*[^\s\?]+\s*.*?\s*\?>\s*/s ) { $text = $'; $nodecount{ 'PI' } ++; } else { &parse_error( "PI syntax" ); } # element } elsif( $text =~ /\s*]+)\s+([^\s>][^>]+)\/>/) { $text = $'; $element_frequency{ $1 } ++; my $atts = $2; &parse_atts( $atts ); # empty element, no atts } elsif( $text =~ /^\s*<([^\/\s>]+)\s*\/>/) { $text = $'; $element_frequency{ $1 } ++; # container element } elsif( $text =~ /^\s*<([^\/\s>]+)[^<>]*>/) { my $name = $1; $element_frequency{ $name } ++; # process attributes my $atts = ""; $atts = $1 if( $text =~ /^\s*<[^\/\s>]+\s+([^\s>][^>]+)>/); $text = $'; &parse_atts( $atts ) if $atts; # process child nodes while( $text !~ /^<\/$name\s*>/ ) { $text = &get_node( $text ); } # check for end tag if( $text =~ /^<\/$name\s*>/ ) { $text = $'; } else { &parse_error( "end tag for element <$name>" ); } $nodecount{ 'element' } ++; # some kind of parse error } else { if( $text =~ /^\s*<\/([^>]+)/ ) { &parse_error( "missing start tag for element <$1>" ); } else { &parse_error( "reserved character (<) in text" ); } } } else { &parse_error( "unidentified text" ); } # update running info $allnodecount ++; $lastline = $& if( $text =~ /%[:]+:[:]+/ ); return $text; } # parse_atts # ---------- # verify syntax of attributes # sub parse_atts { my( $text ) = @_; $text =~ s/%.*?%//sg; while( $text ) { if( $text =~ /\s*([^\s=]+)\s*=\s*([\"][^\"]*[\"])/ || $text =~ /\s*([^\s=]+)\s*=\s*([\'][^\']*[\'])/) { $text = $'; $nodecount{'attribute'} ++; $allnodecount ++; } elsif( $text =~ /^\s+/ ) { $text = $'; } else { &parse_error( "attribute syntax" ); } } } # parse_error # ----------- # abort parsing and print error message with line number and file name # where error occured # sub parse_error { my( $reason ) = @_; my $line = 0; my $file = "unknown file"; if( $lastline =~ /%([^:]+):([^%]+)/ ) { $file = $1; $line = $2 - 1; } die( "Parse error on line $line in $file: $reason.\n" ); }