Web::Chain project: Web/Chain/IO/Html.pm
package Web::Chain::IO::Html;
# doom@kzsu.stanford.edu
# 12 Oct 2004
=head1 NAME
Web::Chain::IO::Html - adds the capability to the IO object to handle the
DF Html format.
=head1 SYNOPSIS
See Web::Chain::IO;
=head1 DESCRIPTION
This class returns an input handle used dynamically
by a Web::Chain::IO handle.
It provides input and output methods for files of
the DF (aka "Doomfiles") Html format.
Two different types of input are supported by this
module:
(1) It can do a light weight skim through a chain of files,
following the next (or prev) links to record information
about the browse sequence.
(2) It can load the nodes of a chain with all information
in the body of html files (node name, next and prev links,
and the main body).
=head1 METHODS
=over
=cut
use 5.006;
use strict;
use warnings;
use Carp;
use File::Basename;
use base 'Web::Chain::IO::Common'; # _handle_input_arguments, etc.
use Web::Chain;
use Web::Definitions qw(
$DEBUG
$DF_VERSION
$DF_EXTRACT_BODY_RULE
$DF_EXTRACT_PREV_NODE_RULE
$DF_EXTRACT_NEXT_NODE_RULE
$DF_EXTRACT_H1_RULE
$DF_TOPNODE_NAME
$DF_BOTNODE_NAME
$DF_THOUGHTS_LINK_RULE
$DF_WHATSNEW_NODE_NAME
$DF_CONTENTS_NODE_NAME
);
use Web::Pro::Transform qw( html2xml xml2html html2xml_simple);
use Web::Pro::HtmlOutput qw( header
footer
first_header
last_footer
text2html
add_node_list_to_whatsnew_body
nodelist_to_html
);
our $VERSION = $DF_VERSION;
=item B<new> - create an input/output handle, used by a
Web::Chain::IO object.
Ex. my $io_handle = $io_class->new( $self );
See Web::Chain::IO
=cut
sub new {
my $class = shift;
my $io_object = shift;
bless {
_io_object => $io_object, # pointing back at the owning object
_input_location => $io_object->input_location,
_output_location => $io_object->output_location,
} , $class;
}
=item B<input_location> - return the input_location
=cut
sub input_location {
my ($self) = @_;
my $subname = ( caller(0) )[3];
return $self->{_input_location};
}
=item B<input> -
Read some doomfiles nodes in html format from the input_location,
return a newly created chain object containing the information.
Uses two ordered arguments which are passed in from the IO input method:
First, the name of the node (or the file, with '.html' extension)
to begin with, second an argument to specify where in the
browse sequence to terminate the input. If the first argument is
undefined, it will start from the top node name defined in
Definitions.pm (typically 'TOP.html', as of this writing).
The second argument can be one of three things: undefined,
which means to continue to the end of the chain, the final
node (or file) name to be input, or it can be the total
number of nodes (an integer).
Example usage:
my $dfh = Web::Chain::IO->new; # Create a doomfiles io handle
$dfh->input_location($input_location); # the current input directory
$dfh->input_format('Html'); # requires Web::Chain::IO::Input::Html
my $chain = $dfh->input('FIN', 1); # should read just the node "FIN"
# Alternate:
my $chain = $dfh->input('TOP', 'DATA'); # should read sequence TOP -> DATA
# Alternate:
my $chain = $dfh->input(); # read the whole sequence
=cut
sub input {
my ($self, $begin, $termination) = @_;
my $subname = ( caller(0) )[3];
my ($begin_node_name, $end_node_name, $fh, $after_location);
my ($prev_node_name, $next_node_name, $body);
# Put this routine into slurp mode
local $/;
undef $/;
($begin_node_name, $end_node_name) =
$self->_handle_input_arguments( $begin, $termination );
# check input_location
my $loc = $self->input_location;
chdir($loc);
my $count = scalar( my @files = <*.html> );
unless($count) { return undef; }; # If there are no files in the input location, give up.
my $chain = Web::Chain->new();
my $node_name = $begin_node_name; # initialize
do { # loop until node name is the last node name, test at end of loop
my $file = "$loc/$node_name.html";
($DEBUG) && print STDERR "Working on file: $file\n";
open $fh, "<$file" or croak "$subname: Can't open $file for input: $!";
my $contents = <$fh>;
# extract parts (next, body, prev)
($prev_node_name) = ($contents =~ m{$DF_EXTRACT_PREV_NODE_RULE} );
($next_node_name) = ($contents =~ m{$DF_EXTRACT_NEXT_NODE_RULE} );
($body) = ($contents =~ m{$DF_EXTRACT_BODY_RULE} );
if ($node_name eq $begin_node_name) { # If the first node, we've found the _beg_prev value
$chain->set_beg_prev( $prev_node_name );
}
my $body_ref = html2xml_simple(\$body); # switching to html2xml_simple for now.
# html2xml was ampup-ing comments: <!-- -->
my $node = $chain->add_link($node_name, $after_location);
$node->set_body($body_ref);
close $fh;
# set next $node_name value to next, then loop.
$after_location = $node_name; # Save last value
$node_name = $next_node_name; # Set up the next value
### TODO - check for infinite loop (tangled chain?)
} until ( ($after_location eq $end_node_name) ); # Just did the last node, so stop
$chain->set_end_next( $next_node_name );
return $chain;
}
=item B<get_browse_sequence> - takes a single argument, the location,
and determines the browse sequence for the existing DF nodes there.
This is essentially an internally used routine
containing code to implement the following two methods.
This version is capable of getting a sequence from locations
without a TOP or a FIN node.
=cut
sub get_browse_sequence {
my ($self, $location) = @_;
my $subname = ( caller(0) )[3];
my (@files, $seed);
# Go into file slurp mode (this sub only)
local $/;
undef $/;
# check the location
chdir($location);
my $count = scalar( @files = <*.html> );
unless($count) { return undef; }; # If there are no files in the input location, give up.
my $chain = Web::Chain->new();
my $begin_node_name = $DF_TOPNODE_NAME;
my $end_node_name = $DF_BOTNODE_NAME;
my $topfile = "$location/$begin_node_name.html";
unless (-e $topfile) {
($DEBUG) && print STDERR "Default first node called $begin_node_name doesn't exist. Scanning for alternate...";
($seed = $files[0]) =~ s/\.html$//;
$begin_node_name = $self->{_io_object}->find_first_node_from_input($seed);
($DEBUG) && print STDERR "Found beginning of sequence: $begin_node_name";
}
my ($after_location);
my $node_name = $begin_node_name; # initialize
LOOP: {
do { # loop until node name is the last node name
my $file = "$location/$node_name.html";
($DEBUG) && print STDERR "$subname: working on file: $file\n";
my ($fh);
open $fh, "<$file" or croak "$subname: Can't open $file for input: $!";
my $contents = <$fh>;
# extract parts (next, body, prev)
my ($prev_node_name) = ($contents =~ m{$DF_EXTRACT_PREV_NODE_RULE} );
my ($next_node_name) = ($contents =~ m{$DF_EXTRACT_NEXT_NODE_RULE} );
my $node = $chain->add_link($node_name, $after_location);
close $fh;
# set next $node_name value to next, then loop.
$after_location = $node_name; # Save last value
$node_name = $next_node_name; # Set up the next value
### TODO - check for infinite loop (because of tangled chain)
last if not($next_node_name);
} until ( ($after_location eq $end_node_name) ); # Just did the last node, so stop
} # end LOOP
my @list = $chain->get_names;
return \@list;
}
=item B<get_browse_sequence_from_output> - returns a reference to a list
of all doomfiles html pages in the output location, in the order
of the browse sequence.
=cut
sub get_browse_sequence_from_output{
my ($self) = @_;
my $subname = ( caller(0) )[3];
$self->get_browse_sequence( $self->output_location() );
}
=item B<get_browse_sequence_from_input> - returns a reference to a list
of all doomfiles html pages in the input location, in the order
of the browse sequence.
=cut
sub get_browse_sequence_from_input {
my ($self) = @_;
my $subname = ( caller(0) )[3];
$self->get_browse_sequence( $self->input_location() );
}
=item B<output_location> - return the output_location
=cut
sub output_location {
my ($self) = @_;
my $subname = ( caller(0) )[3];
return $self->{_output_location};
}
=item B<output> -
writes some nodes to the output_location in HTML format.
Uses three ordered arguments which are passed
in from the IO output method: First, a chain object, second,
the node in the chain to begin from -- as usual, either the
node object or the node name work, and the third which specifies
where in the chain to terminate output. The third argument
can be one of three things: undefined, which means to continue
to the end of the chain, the final node to be output, or it
can be the total number of nodes (an integer).
Example usage:
$io_obj->output( $chain, $begin_node, $termination );
=cut
sub output {
my ($self, $chain, $begin, $termination) = @_;
my $subname = ( caller(0) )[3];
my ($begin_node, $end_node, $begin_name, $end_name, $fh, $filename);
($begin_node, $end_node) =
$self->_handle_output_arguments( $chain, $begin_node, $termination );
my $node = $begin_node; # Initialize to first node in chain
unless ($node) { return undef; } # If chain is empty, give up now.
# qualify output_location, initialize actual physical location
my $loc = $self->output_location;
mkdir($loc) or croak "$subname: Could not mkdir $loc: $!" unless -d $loc;
if ($DEBUG) { # check to make sure this is not a production location
chdir($loc);
my @htmlfiles = glob <*.html>;
if ($#htmlfiles > 25) {
croak "Too many html files in the output location for a safe test run";
}
}
while ( 1 ) { # step forward through chain, stop when we find $end_node or hit the end
$filename = $node->get_name . '.html'; # generate $filename from node name
chdir($loc);
open $fh, ">$loc/$filename" or croak "Couldn't open $loc/$filename for output: $!";
if( $node->get_prev ){
print $fh header( $node->get_name, $node->get_prev->get_name );
} else { # First node: if "get_beg_prev" is defined, use that in prev link,
# otherwise fall back on "first_header" (with no prev link).
my $prev_name;
if ( $prev_name = $chain->get_beg_prev ) {
print $fh header( $node->get_name, $prev_name );
} else {
print $fh first_header( $node->get_name );
}
}
my $bodyref = xml2html( $node->get_body );
print $fh ${ $bodyref };
if ( $node->get_next ) {
print $fh footer( $node->get_next->get_name );
} else { # Last node, if "end next" is defined, use it, else just use footer (no next node)
my $next_name;
if ( $next_name = $chain->get_end_next ) {
print $fh footer( $next_name );
} else {
print $fh last_footer();
}
}
close $fh;
# Just being paranoid about possible crazy loop structures
my $i++;
my $limit = 1000; # TODO increase in production use
carp "$subname: over $limit iterations, looks like infinite loop" if $i > $limit;
if ($node eq $end_node) { last; } ; # exit loop if we've found the $end_node
unless ( $node = $node->next ) { last; } # advance to next node in the chain, exit if at end
}
return 1; # TODO any better return value?
}
=item B<output_splice> - output_splice is similar to L<output>
though it is designed to be used with an output_location
that already has some content: output_splice outputs an
entire chain at some position in the existing browse
sequence, splicing the ends of the chain and the two
existing nodes that are affected. It takes two
arguments, a chain object, and the name of the position
in the existing browse sequence. This position is
expected to match the DF name pattern, and for output_splice
to take place, all names involved have to be unique, i.e.
none of the node names in the chain can already be in use
in the output location.
Example usage:
$self->output_splice( $chain, $position );
=cut
sub output_splice {
my ($self, $chain, $position) = @_;
my $subname = ( caller(0) )[3];
my ($begin_node, $end_node, $begin_name, $end_name, $fh, $filename);
my $original_input_location = $self->input_location;
$self->{_input_location} = $self->output_location;
### TODO - just use accessor? Write "set_input_location" method...
### Any problems with the dynamic load trick? (Need more test cases)
my $browse_ref = $self->get_browse_sequence_from_output;
# Check that the node names in the chain to be output
# do not already exist in the browse sequence
my @new_nodes = $chain->get_names;
my %browse_exists = ();
foreach ( @{$browse_ref} ) {
$browse_exists{$_} = 1;
}
foreach ( @new_nodes ) {
if ($browse_exists{$_}) {
$self->{_input_location}= $original_input_location;
croak "$subname: there is already a node named $_ in the output location"; }
}
($DEBUG) && print STDERR "browse_exists hash: " . join(" ", (keys %browse_exists) ) . "\n";
# Make sure that $position exists in the output location
unless ($browse_exists{ $position }) {
$self->{_input_location}= $original_input_location;
croak "$subname: Can't find $position in the output location: " . $self->output_location;
}
# read-in a two segment chain, position and the location
# immediately following it.
my $chain_frame = $self->input( $position, 2 );
# insert the chain of new material into this framework
$chain_frame->insert_segment($chain, $position);
# Then just do the old normal "output" method on this modified
# chain segment. The first and last nodes now over-write
# the existing html files (this is a file overwrite, but not a
# node overwrite: from the node/chain point of view, this is
# just a link update)
# Pass on to output the entire modified chain (last two args = undef)
my $ret =
$self->output( $chain_frame );
return $ret;
}
=back
=head2 Probing a Node's Meta_info
The following two routines look-up a node by name
(from either the input or the output location) returning
three items, the name of the node, the next node link,
and the prev node link. Returning the node
name when node name is also the argument might seem
redundant, but (a) it can be treated as a flag to
determine if the input was sucessful (prev or next would
be undef for the first and last nodes, respectively) and
also (b) where possible the format-specific code will
read the node name in a different way than the lookup is
performed -- e.g. in the case of Html, the returned name is
that of the page title, but the given name is the file
name.
=over
=item B<input_meta_info> - internally used routine that takes two
arguments, the location and the node name, and does the real
work of looking up the meta information.
Used by input_meta_info_from_input and input_meta_info_from_output.
Note that this code is intentionally kept quite simple, and
does no argument checking of any sort (not even a pattern
match to make sure it's a well-formed DF name). It is
indirectly used by other code that does argument checking.
=cut
sub input_meta_info {
my ($self, $loc, $name) = @_;
my $subname = ( caller(0) )[3];
my ($fh, $filename, $text, $title_h1, $next, $prev);
$filename = "$loc/$name.html";
open $fh, "<$filename" or croak "$subname: Could not open $filename for input: $!";
local $/;
undef $/;
$text = <$fh>;
# extract parts (next, body, title (from h1 tags)
($prev) = ($text =~ m{$DF_EXTRACT_PREV_NODE_RULE} );
($next) = ($text =~ m{$DF_EXTRACT_NEXT_NODE_RULE} );
($title_h1) = ($text =~ m{$DF_EXTRACT_H1_RULE} );
return ($title_h1, $prev, $next);
}
=item B<input_meta_info_from_input> - given a node name, gets meta
information for a node located in the input location.
Example usage:
($node_name, $next_name, $prev_name) = $io_handle->input_meta_info_from_input($node_name)
=cut
sub input_meta_info_from_input {
my ($self, $name) = @_;
my $subname = ( caller(0) )[3];
my $loc = $self->input_location();
$self->input_meta_info($loc, $name);
}
=item B<input_meta_info_from_output> - given a node name, gets meta
information for a node located in the output location.
Example usage:
($node_name, $next_name, $prev_name) = $io_handle->output_meta_info_from_output($node_name)
=cut
sub input_meta_info_from_output {
my ($self, $name) = @_;
my $subname = ( caller(0) )[3];
my $loc = $self->output_location();
$self->input_meta_info($loc, $name);
}
=item B<log_new_names> - given a reference to an array of node
names, include a list of them in a special node reserved
for the purpose of logging these additions. The default name for
that node is defined in Web::Definitions as $DF_WHATSNEW_NODE_NAME,
it can be overridden by specifying it as the optional second
argument. The return value is the date stamp value used in the log
is returned (this is largely for testing purposes: it makes the
exact output a little easier to predict).
Example usage:
$dfh->log_new_names( \@new_node_names, $log_node );
=cut
sub log_new_names {
my ($self, $new_node_list_ref, $log_node_name) = @_;
my $subname = ( caller(0) )[3];
my $node_name = $log_node_name || $DF_WHATSNEW_NODE_NAME;
my $original_input_location = $self->{_input_location};
$self->{_input_location} = $self->{_output_location};
my $chain = $self->input($node_name, 1); # read just the one log node
my $node = $chain->get_node_from_name($node_name);
my $content_ref = $node->get_body;
my $datestamp =
add_node_list_to_whatsnew_body($new_node_list_ref, $content_ref);
# (Note that we need not do a "set_body" because we're acting on
# a reference to the same text that the node references.
# Similarly, no chain manipulation is needed, the node is there
# already, we're modifying it in place.)
$self->output( $chain, $node_name, 1 );
$self->{_input_location}= $original_input_location;
return $datestamp; # This return is useful for writing test code
}
=item B<generate_contents_node> -
re-generate the "contents"
(think "table of contents") node, a listing of all nodes
in the project (i.e. in the output location).
The default name for that node is defined in
Web::Definitions as $DF_CONTENTS_NODE_NAME, it can be
overridden with the optional argument. This method
reads in the existing file first, updates it, and
writes it out again: this ensures that next and prev links
remain intact. The main body of the node is replaced by
a new list describing the current browse sequence,
though any introductory remarks before the first link will
be preserved.
=cut
sub generate_contents_node {
my ($self, $contents_node) = @_;
my $subname = ( caller(0) )[3];
my $contents_name = $contents_node || $DF_CONTENTS_NODE_NAME;
my $original_input_location = $self->{_input_location};
$self->{_input_location} = $self->{_output_location};
my $chain = $self->input($contents_name, 1); # read just the one contents node
my $node = $chain->get_node_from_name($contents_name);
my $body_ref = $node->get_body;
# Preserve the introduction before the first html link
my ($lead_paragraph) = ( ${$body_ref} =~ m{ \A (.*? \s* $) \s* <A \s+ HREF}msx );
my $browse_ref = $self->get_browse_sequence_from_output;
my $text_ref = nodelist_to_html($browse_ref, ' ');
${ $body_ref } = $lead_paragraph . "\n" . ${ $text_ref } . "\n\n";
$self->output( $chain, $contents_name, 1 );
$self->{_input_location}= $original_input_location;
}
=back
=head2 PROCEEDURAL ROUTINES
Some older routines imported from a proceedural codebase.
There's no point in being any fancier than this, since
a re-write to use Mason components is in the works.
These have all been moved to:
Web::Pro::HtmlOutput
=over
=item header
=item footer
=item first_header
=item last_footer
=item text2html
=cut
1;
__END__
=back
=head1 TODO
Future development plans:
The conversion of the main body to html, and the html format
of the headers and footers, is currently handled by some
proceedural routines, closely based on the original legacy scripts.
Ultimately, I expect to replace with a templating system (most
likely Mason, though TextTemplate is a candidate).
=head1 SEE ALSO
L<Project Documentation|Web::Project>
=head1 AUTHOR
Joseph Brenner, E<lt>doom@kzsu.stanford.eduE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2004 by Joseph Brenner
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.2 or,
at your option, any later version of Perl 5 you may have available.
=head1 BUGS
None reported... yet.
=cut
Joseph Brenner,
Sat Nov 6 17:04:11 2004