Web::Chain project:    Web/Chain/IO/Html.pm


     package Web::Chain::IO::Html;
#                                doom@kzsu.stanford.edu
#                                12 Oct 2004

=head1 NAME

Web::Chain::IO::Html - adds the capability to the IO object to handle the 
  DF Html format.

=head1 SYNOPSIS

   See Web::Chain::IO;

=head1 DESCRIPTION

This class returns an input handle used dynamically
by a Web::Chain::IO handle.  

It provides input and output methods for files of 
the DF (aka "Doomfiles") Html format.

Two different types of input are supported by this 
module: 

(1) It can do a light weight skim through a chain of files,
following the next (or prev) links to record information
about the browse sequence.

(2) It can load the nodes of a chain with all information 
in the body of html files (node name, next and prev links, 
and the main body). 

=head1 METHODS

=over 

=cut

use 5.006;
use strict; 
use warnings;
use Carp;
use File::Basename;

use base 'Web::Chain::IO::Common';  # _handle_input_arguments, etc.

use Web::Chain;
use Web::Definitions qw(
                        $DEBUG
                        $DF_VERSION
                        $DF_EXTRACT_BODY_RULE
                        $DF_EXTRACT_PREV_NODE_RULE
                        $DF_EXTRACT_NEXT_NODE_RULE
                        $DF_EXTRACT_H1_RULE
                        $DF_TOPNODE_NAME
                        $DF_BOTNODE_NAME
                        $DF_THOUGHTS_LINK_RULE
                        $DF_WHATSNEW_NODE_NAME
                        $DF_CONTENTS_NODE_NAME
                       );
use Web::Pro::Transform qw( html2xml xml2html html2xml_simple);

use Web::Pro::HtmlOutput qw( header
                             footer
                             first_header
                             last_footer
                             text2html
                             add_node_list_to_whatsnew_body
                             nodelist_to_html
                           );

our $VERSION = $DF_VERSION;

=item B<new> - create an input/output handle, used by a 
   Web::Chain::IO object.
   Ex.  my $io_handle = $io_class->new( $self );
   See Web::Chain::IO

=cut

sub new { 
  my $class = shift;
  my $io_object = shift;
  bless { 
         _io_object => $io_object, # pointing back at the owning object
         _input_location => $io_object->input_location,
         _output_location => $io_object->output_location,
        } , $class;
}

=item B<input_location> - return the input_location 

=cut

sub input_location {   
  my ($self) = @_;
   my $subname = ( caller(0) )[3];
   return $self->{_input_location};
}


=item B<input> -
  Read some doomfiles nodes in html format from the input_location, 
  return a newly created chain object containing the information.
  Uses two ordered arguments which are passed in from the IO input method: 
  First, the name of the node (or the file, with '.html' extension) 
  to begin with, second an argument to specify where in the 
  browse sequence to terminate the input. If the first argument is 
  undefined, it will start from the top node name defined in 
  Definitions.pm (typically 'TOP.html', as of this writing).
  The second argument can be one of three things: undefined,
  which means to continue to the end of the chain, the final
  node (or file) name to be input, or it can be the total
  number of nodes (an integer).
  Example usage:

    my $dfh = Web::Chain::IO->new; # Create a doomfiles io handle
    $dfh->input_location($input_location);  # the current input directory
    $dfh->input_format('Html');   # requires Web::Chain::IO::Input::Html
    my $chain = $dfh->input('FIN', 1); # should read just the node "FIN"

  # Alternate:
    my $chain = $dfh->input('TOP', 'DATA'); # should read sequence TOP -> DATA

  # Alternate:
    my $chain = $dfh->input(); # read the whole sequence

=cut 

sub input {

  my ($self, $begin, $termination) = @_;
  my $subname = ( caller(0) )[3];

  my ($begin_node_name, $end_node_name, $fh, $after_location);
  my ($prev_node_name, $next_node_name, $body);

  # Put this routine into slurp mode
  local $/;
  undef $/;

  ($begin_node_name, $end_node_name) = 
     $self->_handle_input_arguments( $begin, $termination );

  # check input_location
  my $loc = $self->input_location;
  chdir($loc);
  my $count = scalar( my @files = <*.html> );
  unless($count) { return undef; }; # If there are no files in the input location, give up.

  my $chain = Web::Chain->new();
  
  my $node_name = $begin_node_name;  # initialize 

  do {  # loop until node name is the last node name, test at end of loop
    my $file = "$loc/$node_name.html";

    ($DEBUG) && print STDERR "Working on file: $file\n";
    open $fh, "<$file" or croak "$subname: Can't open $file for input: $!";

    my $contents = <$fh>;

    # extract parts (next, body, prev) 
    ($prev_node_name) = ($contents =~ m{$DF_EXTRACT_PREV_NODE_RULE} );      
    ($next_node_name) = ($contents =~ m{$DF_EXTRACT_NEXT_NODE_RULE} );  
    ($body) =           ($contents =~ m{$DF_EXTRACT_BODY_RULE} );           

    if ($node_name eq $begin_node_name) {  # If the first node, we've found the _beg_prev value
       $chain->set_beg_prev( $prev_node_name );
    }

    my $body_ref = html2xml_simple(\$body);  # switching to html2xml_simple for now.
                                             # html2xml was ampup-ing comments: <!-- -->
    my $node = $chain->add_link($node_name, $after_location);
    $node->set_body($body_ref);

    close $fh;
    # set next $node_name value to next, then loop.
    $after_location = $node_name; # Save last value
    $node_name = $next_node_name; # Set up the next value

    ### TODO - check for infinite loop (tangled chain?)

  } until ( ($after_location eq $end_node_name) );  # Just did the last node, so stop
  $chain->set_end_next( $next_node_name );

  return $chain;
}

=item B<get_browse_sequence> - takes a single argument, the location, 
  and determines the browse sequence for the existing DF nodes there.
  This is essentially an internally used routine 
  containing code to implement the following two methods.
  This version is capable of getting a sequence from locations 
  without a TOP or a FIN node. 

=cut 

sub get_browse_sequence { 
  my ($self, $location) = @_;
  my $subname = ( caller(0) )[3];

  my (@files, $seed);
  # Go into file slurp mode (this sub only)
  local $/;
  undef $/;

  # check the location
  chdir($location);
  my $count = scalar( @files = <*.html> );
  unless($count) { return undef; }; # If there are no files in the input location, give up.

  my $chain = Web::Chain->new();

  my $begin_node_name = $DF_TOPNODE_NAME;
  my $end_node_name = $DF_BOTNODE_NAME; 

  my $topfile = "$location/$begin_node_name.html";
  unless (-e $topfile) { 
     ($DEBUG) && print STDERR "Default first node called $begin_node_name doesn't exist. Scanning for alternate...";
     ($seed = $files[0]) =~ s/\.html$//;
     $begin_node_name = $self->{_io_object}->find_first_node_from_input($seed);               
     ($DEBUG) && print STDERR "Found beginning of sequence: $begin_node_name";
  }

  my ($after_location);
  my $node_name = $begin_node_name;  # initialize 
  LOOP: { 
    do {       # loop until node name is the last node name 
      my $file = "$location/$node_name.html";
      ($DEBUG) && print STDERR "$subname: working on file: $file\n";
      my ($fh);
      open $fh, "<$file" or croak "$subname: Can't open $file for input: $!";

      my $contents = <$fh>;

      # extract parts (next, body, prev) 
      my ($prev_node_name) = ($contents =~ m{$DF_EXTRACT_PREV_NODE_RULE} );      
      my ($next_node_name) = ($contents =~ m{$DF_EXTRACT_NEXT_NODE_RULE} );  

      my $node = $chain->add_link($node_name, $after_location);

      close $fh;
      # set next $node_name value to next, then loop.
      $after_location = $node_name; # Save last value
      $node_name = $next_node_name; # Set up the next value

      ### TODO - check for infinite loop (because of tangled chain)
      last if not($next_node_name);
    } until ( ($after_location eq $end_node_name) ); # Just did the last node, so stop
  } # end LOOP

  my @list = $chain->get_names;
  return \@list; 

}



=item B<get_browse_sequence_from_output> - returns a reference to a list 
       of all doomfiles html pages in the output location, in the order 
       of the browse sequence.  

=cut

sub get_browse_sequence_from_output{ 
  my ($self) = @_;
  my $subname = ( caller(0) )[3];
  $self->get_browse_sequence( $self->output_location() );
}

=item  B<get_browse_sequence_from_input> - returns a reference to a list 
       of all doomfiles html pages in the input location, in the order 
       of the browse sequence.  

=cut

sub get_browse_sequence_from_input { 
  my ($self) = @_;
  my $subname = ( caller(0) )[3];
  $self->get_browse_sequence( $self->input_location() );
}


=item B<output_location> - return the output_location 

=cut

sub output_location {   
  my ($self) = @_;
   my $subname = ( caller(0) )[3];
   return $self->{_output_location};
}

=item B<output> -
  writes some nodes to the output_location in HTML format.
  Uses three ordered arguments which are passed
  in from the IO output method: First, a chain object, second, 
  the node in the chain to begin from -- as usual, either the
  node object or the node name work, and the third which specifies
  where in the chain to terminate output.  The third argument
  can be one of three things: undefined, which means to continue 
  to the end of the chain, the final node to be output, or it 
  can be the total number of nodes (an integer).  
  Example usage:

    $io_obj->output( $chain, $begin_node, $termination );

=cut 

sub output {

  my ($self, $chain, $begin, $termination) = @_;
  my $subname = ( caller(0) )[3];

  my ($begin_node, $end_node, $begin_name, $end_name, $fh, $filename);

  ($begin_node, $end_node) = 
     $self->_handle_output_arguments( $chain, $begin_node, $termination );

  my $node = $begin_node;            # Initialize to first node in chain
  unless ($node) { return undef;  }  # If chain is empty, give up now.

  # qualify output_location, initialize actual physical location
  my $loc = $self->output_location;
  mkdir($loc) or croak "$subname: Could not mkdir $loc: $!" unless -d $loc; 

  if ($DEBUG) { # check to make sure this is not a production location
     chdir($loc);
     my @htmlfiles = glob <*.html>;
     if ($#htmlfiles > 25) { 
       croak "Too many html files in the output location for a safe test run";
     }
  }

  while ( 1 ) { # step forward through chain, stop when we find $end_node or hit the end
    $filename = $node->get_name . '.html';  # generate $filename from node name
    chdir($loc);
    open $fh, ">$loc/$filename" or croak "Couldn't open $loc/$filename for output: $!";
    if( $node->get_prev ){
      print $fh header( $node->get_name, $node->get_prev->get_name );
    } else { # First node: if "get_beg_prev" is defined, use that in prev link, 
             # otherwise fall back on "first_header" (with no prev link).
      my $prev_name;
      if ( $prev_name = $chain->get_beg_prev ) { 
        print $fh header( $node->get_name, $prev_name );
      } else { 
        print $fh first_header( $node->get_name );
      }

    }
    my $bodyref = xml2html( $node->get_body );
    print $fh ${ $bodyref };
    if ( $node->get_next ) { 
      print $fh footer( $node->get_next->get_name );
    } else { # Last node, if "end next" is defined, use it, else just use footer (no next node)
      my $next_name;
      if ( $next_name = $chain->get_end_next ) { 
        print $fh footer( $next_name );
      } else { 
        print $fh last_footer(); 
      }
    }
    close $fh;

    # Just being paranoid about possible crazy loop structures
    my $i++;
    my $limit = 1000;      # TODO increase in production use
    carp "$subname: over $limit iterations, looks like infinite loop" if $i > $limit;

    if ($node eq $end_node)        { last; } ; # exit loop if we've found the $end_node
    unless ( $node = $node->next ) { last; }   # advance to next node in the chain, exit if at end
  } 

  return 1;  # TODO any better return value? 
}

=item B<output_splice> -  output_splice is similar to L<output>
   though it is designed to be used with an output_location
   that already has some content: output_splice outputs an
   entire chain at some position in the existing browse
   sequence, splicing the ends of the chain and the two
   existing nodes that are affected.  It takes two
   arguments, a chain object, and the name of the position
   in the existing browse sequence.  This position is
   expected to match the DF name pattern, and for output_splice
   to take place, all names involved have to be unique, i.e. 
   none of the node names in the chain can already be in use 
   in the output location.
   Example usage:

     $self->output_splice( $chain, $position );

=cut 

sub output_splice {
  my ($self, $chain, $position) = @_;
  my $subname = ( caller(0) )[3];

  my ($begin_node, $end_node, $begin_name, $end_name, $fh, $filename);

  my $original_input_location = $self->input_location;
  $self->{_input_location} = $self->output_location;
  ### TODO - just use accessor?  Write "set_input_location" method... 
  ###        Any problems with the dynamic load trick? (Need more test cases)  

  my $browse_ref = $self->get_browse_sequence_from_output;  

  # Check that the node names in the chain to be output
  # do not already exist in the browse sequence 

  my @new_nodes = $chain->get_names;

  my %browse_exists = ();
  foreach ( @{$browse_ref} ) {
    $browse_exists{$_} = 1;
  }
  foreach ( @new_nodes ) { 
    if ($browse_exists{$_}) { 
      $self->{_input_location}= $original_input_location;
      croak "$subname: there is already a node named $_ in the output location"; }
  }

  ($DEBUG) && print STDERR "browse_exists hash: " . join(" ", (keys %browse_exists) ) . "\n";

  # Make sure that $position exists in the output location
  unless ($browse_exists{ $position }) { 
    $self->{_input_location}= $original_input_location;
    croak "$subname: Can't find $position in the output location: " . $self->output_location; 
  }
  
  # read-in a two segment chain, position and the location 
  # immediately following it.

   my $chain_frame = $self->input( $position, 2 );

  # insert the chain of new material into this framework

  $chain_frame->insert_segment($chain, $position);

  # Then just do the old normal "output" method on this modified 
  # chain segment.  The first and last nodes now over-write 
  # the existing html files (this is a file overwrite, but not a 
  # node overwrite: from the node/chain point of view, this is 
  # just a link update)  

  # Pass on to output the entire modified chain (last two args = undef)
  my $ret = 
    $self->output( $chain_frame );
  return $ret;
}

=back 

=head2 Probing a Node's Meta_info 

The following two routines look-up a node by name
(from either the input or the output location) returning 
three items, the name of the node, the next node link, 
and the prev node link.    Returning the node
name when node name is also the argument might seem
redundant, but (a) it can be treated as a flag to
determine if the input was sucessful (prev or next would
be undef for the first and last nodes, respectively) and
also (b) where possible the format-specific code will
read the node name in a different way than the lookup is
performed -- e.g. in the case of Html, the returned name is
that of the page title, but the given name is the file
name. 

=over 

=item B<input_meta_info> - internally used routine that takes two 
  arguments, the location and the node name, and does the real 
  work of looking up the meta information. 
  Used by input_meta_info_from_input and input_meta_info_from_output.
  Note that this code is intentionally kept quite simple, and 
  does no argument checking of any sort (not even a pattern 
  match to make sure it's a well-formed DF name).  It is 
  indirectly used by other code that does argument checking.

=cut

sub input_meta_info { 
   my ($self, $loc, $name) = @_;
   my $subname = ( caller(0) )[3];

   my ($fh, $filename, $text, $title_h1, $next, $prev);
   $filename = "$loc/$name.html";
   open $fh, "<$filename" or croak "$subname: Could not open $filename for input: $!";

   local $/;
   undef $/;

   $text = <$fh>;   
      
   # extract parts (next, body, title (from h1 tags) 
   ($prev)     = ($text =~ m{$DF_EXTRACT_PREV_NODE_RULE} );      
   ($next)     = ($text =~ m{$DF_EXTRACT_NEXT_NODE_RULE} );  
   ($title_h1) = ($text =~ m{$DF_EXTRACT_H1_RULE} );           

   return ($title_h1, $prev, $next);

}

=item B<input_meta_info_from_input> - given a node name, gets meta 
   information for a node located in the input location.
   Example usage:

     ($node_name, $next_name, $prev_name) = $io_handle->input_meta_info_from_input($node_name)

=cut 

sub input_meta_info_from_input { 
   my ($self, $name) = @_;
   my $subname = ( caller(0) )[3];

   my $loc = $self->input_location();
   $self->input_meta_info($loc, $name);
}

=item B<input_meta_info_from_output> - given a node name, gets meta 
   information for a node located in the output location.
   Example usage:

     ($node_name, $next_name, $prev_name) = $io_handle->output_meta_info_from_output($node_name)

=cut 

sub input_meta_info_from_output { 
   my ($self, $name) = @_;
   my $subname = ( caller(0) )[3];

   my $loc = $self->output_location();
   $self->input_meta_info($loc, $name);
}

=item B<log_new_names> - given a reference to an array of node 
  names, include a list of them in a special node reserved 
  for the purpose of logging these additions.  The default name for 
  that node is defined in Web::Definitions as $DF_WHATSNEW_NODE_NAME, 
  it can be overridden by specifying it as the optional second
  argument. The return value is the date stamp value used in the log
  is returned (this is largely for testing purposes: it makes the 
  exact output a little easier to predict). 
  Example usage: 

     $dfh->log_new_names( \@new_node_names, $log_node );

=cut 

sub log_new_names {
  my ($self, $new_node_list_ref, $log_node_name) = @_;
  my $subname = ( caller(0) )[3];

  my $node_name = $log_node_name || $DF_WHATSNEW_NODE_NAME;

  my $original_input_location = $self->{_input_location};
  $self->{_input_location} = $self->{_output_location};

  my $chain = $self->input($node_name, 1); # read just the one log node

  my $node = $chain->get_node_from_name($node_name);

  my $content_ref = $node->get_body;

  my $datestamp = 
     add_node_list_to_whatsnew_body($new_node_list_ref, $content_ref);
   
  # (Note that we need not do a "set_body" because we're acting on  
  # a reference to the same text that the node references. 
  # Similarly, no chain manipulation is needed, the node is there 
  # already, we're modifying it in place.)

   $self->output( $chain, $node_name, 1 );
   $self->{_input_location}= $original_input_location;

   return $datestamp;  # This return is useful for writing test code
}

=item B<generate_contents_node> - 
  re-generate the "contents" 
  (think "table of contents") node, a listing of all nodes 
  in the project (i.e. in the output location).
  The default name for that node is defined in
  Web::Definitions as $DF_CONTENTS_NODE_NAME, it can be
  overridden with the optional argument.  This method 
  reads in the existing file first, updates it, and 
  writes it out again: this ensures that next and prev links 
  remain intact.  The main body of the node is replaced by 
  a new list describing the current browse sequence, 
  though any introductory remarks before  the first link will 
  be preserved.

=cut

sub generate_contents_node { 

  my ($self, $contents_node) = @_;
  my $subname = ( caller(0) )[3];
  my $contents_name = $contents_node || $DF_CONTENTS_NODE_NAME;

  my $original_input_location = $self->{_input_location};
  $self->{_input_location} = $self->{_output_location};

  my $chain = $self->input($contents_name, 1); # read just the one contents node

  my $node = $chain->get_node_from_name($contents_name);
  my $body_ref = $node->get_body;

  # Preserve the introduction before the first html link
  my ($lead_paragraph) = ( ${$body_ref} =~ m{ \A (.*? \s* $) \s* <A \s+ HREF}msx );

  my $browse_ref = $self->get_browse_sequence_from_output;  

  my $text_ref = nodelist_to_html($browse_ref, '         ');

  ${ $body_ref } = $lead_paragraph . "\n" . ${ $text_ref } . "\n\n";

  $self->output( $chain, $contents_name, 1 );
  $self->{_input_location}= $original_input_location;

}


=back

=head2 PROCEEDURAL ROUTINES

Some older routines imported from a proceedural codebase. 
There's no point in being any fancier than this, since 
a re-write to use Mason components is in the works. 

These have all been moved to:

Web::Pro::HtmlOutput

=over

=item  header
=item  footer
=item  first_header
=item  last_footer
=item  text2html

=cut

1;
__END__

=back 

=head1 TODO 

Future development plans:

The conversion of the main body to html, and the html format 
of the headers and footers, is currently handled by some 
proceedural routines, closely based on the original legacy scripts. 
Ultimately, I expect to replace with a templating system (most 
likely Mason, though TextTemplate is a candidate). 

=head1 SEE ALSO

L<Project Documentation|Web::Project>

=head1 AUTHOR

Joseph Brenner, E<lt>doom@kzsu.stanford.eduE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2004 by Joseph Brenner

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.2 or,
at your option, any later version of Perl 5 you may have available.

=head1 BUGS

None reported... yet.

=cut

     

Joseph Brenner, Sat Nov 6 17:04:11 2004