#! /usr/bin/perl
#
# Print a numbered outline of the headings (h1 .. h6) of an HTML document.
#
# The document is read from the files named on the command line, or from
# standard input when none are given. One line is printed per heading:
#
#     <section number>:<TAB>"<heading text>"
#
# where the section number is a dotted counter such as 1, 1.1, 1.2, 2, ...

use strict;
use warnings;

use HTML::TreeBuilder;
use Scalar::Util qw(blessed);

# Return the direct children of $elem whose tag is one of @tagnames.
#
# Only immediate children are examined (descendants nested deeper are not
# searched), and they are returned in document order. Text children, which
# HTML::Element stores as plain strings, are skipped.
#
# Parameters:
#   $elem     - an HTML::Element, or any subclass such as HTML::TreeBuilder
#   @tagnames - tag names to accept, e.g. ('h1', 'h2')
#
# Dies if $elem is not an HTML::Element object.
sub find_elements_by_tag_names {
    my ($elem, @tagnames) = @_;

    # Use isa() rather than comparing ref() strings so that subclasses of
    # HTML::Element are accepted as well.
    if (!blessed($elem) || !$elem->isa('HTML::Element')) {
        die('Invalid argument to find_elements_by_tag_names');
    }

    my %is_wanted = map { $_ => 1 } @tagnames;

    return grep {
        blessed($_) && $_->isa('HTML::Element') && $is_wanted{ $_->tag() }
    } $elem->content_list();
}

# Print a section number and the heading text for every heading that is a
# direct child of the document's <body>.
#
# Headings nested inside other elements (for example inside a <div>) are not
# direct children of <body> and are therefore not listed.
#
# Parameters:
#   $doc - a parsed HTML::TreeBuilder tree
#
# Dies if $doc is not an HTML::TreeBuilder object. Prints nothing when the
# document has no <body> element.
sub number_sections {
    my ($doc) = @_;

    if (!blessed($doc) || !$doc->isa('HTML::TreeBuilder')) {
        die('Invalid argument to number_sections');
    }

    my ($body) = find_elements_by_tag_names($doc, 'body');
    return if !defined $body;

    my @headers = find_elements_by_tag_names($body, qw(h1 h2 h3 h4 h5 h6));

    # One counter per heading level; index 0 belongs to h1, index 5 to h6.
    my @counters = (0) x 6;

    for my $header (@headers) {
        my ($level) = $header->tag() =~ /\Ah([1-6])\z/;

        # Starting a section at this level restarts every deeper counter.
        $_ = 0 for @counters[$level .. $#counters];
        ++$counters[$level - 1];

        # Drop the unused deeper levels: "1.2.0.0.0.0" becomes "1.2".
        my $number = join('.', @counters);
        $number =~ s/(?:\.0)+\z//;

        # as_text() flattens inline markup (<a>, <em>, ...) to plain text.
        # Printing content_list() directly would emit "HTML::Element=HASH(...)"
        # for every child element of the heading.
        print($number, ":\t\"", $header->as_text(), "\"\n");
    }

    return;
}

# Parse the HTML given on the command line (or standard input) and print its
# numbered heading outline.
sub main {
    my $doc = HTML::TreeBuilder->new();

    while (my $line = <>) {
        $doc->parse($line);
    }

    # parse() only feeds chunks to the parser; eof() must be called once all
    # input has been supplied so that the tree is completed.
    $doc->eof();

    number_sections($doc);

    # Release the tree explicitly once it is no longer needed.
    $doc->delete();

    return;
}

main();