Web::Chain project:    Web/Pro/Transform.pm


     package Web::Pro::Transform;
#                                doom@kzsu.stanford.edu
#                                08 Oct 2004

=head1 NAME

Web::Pro::Transform - procedural functions for converting text between doomfiles, XML and HTML markup

=head1 SYNOPSIS

   use Web::Pro::Transform qw( text2xml );
   [...]
   $node->set_body( text2xml( $rawtext_ref ) );


=head1 DESCRIPTION

A collection of procedurally oriented functions for 
transformations of text to and from different markup 
languages.  

This is a set of utilities to facilitate the Web::Chain
project (for improving "doomfiles" processing).


=head2 EXPORT

All of the following functions are OK for export: 

=over

=cut

use 5.006;
use strict; 
use warnings;
use Carp;

use Web::Definitions qw( $DEBUG 
                         $DF_VERSION
                         $DF_THOUGHTS_LINK_RULE
                         $DF_NODE_NAME_RULE
                        );

require Exporter;

our @ISA = ('Exporter');

# Every public function may be imported by name, or all of them at once
# through the ':all' tag.  Nothing is exported unless the caller asks.
our %EXPORT_TAGS = (
  all => [
    qw(
      text2xml        xml2text
      ampup           deamp
      text2html       html2text
      xml2html        html2xml
      html2xml_simple
    )
  ],
);

our @EXPORT_OK = @{ $EXPORT_TAGS{all} };
our @EXPORT    = ();

# The module version tracks the project-wide $DF_VERSION.
our $VERSION = $DF_VERSION;

=item B<text2xml> - takes df-style block of text, transforms it 
   into xml used in internal storage.

=cut 

# text2xml( $ref )
#
# Converts a block of raw doomfiles text into the xml form used for
# internal storage.
#
#   $ref - reference to a scalar holding the raw text.  The referenced
#          scalar is never modified; all work is done on a copy.
#
# Returns a reference to a new scalar holding the converted content.
sub text2xml {
  my $ref = shift;

  # make a copy of the text to work on
  my $text = ${ $ref };

  # Do this first, to disambiguate the characters < > & in the original
  # text from the markup added below.
  ampup($text);

  # Turn doomfiles jumps into xml/html jumps.
  $text =~ s|$DF_THOUGHTS_LINK_RULE|$1<A HREF="$2.html">$2</A>$3|msg;

  # Well formed xml requires a single top-level element.  Wrap by
  # concatenation rather than s/$/.../: a bare '$' also matches just
  # before a trailing newline, which would put the closing tag in front
  # of that newline and leave it outside the element.
  $text = '<NODEBODY>' . $text . '</NODEBODY>';

  return \$text;  # return a reference to converted content
}


=item B<xml2html> - takes content in internal xml format, transforms it 
   into html for web publication.

=cut 

# xml2html( $xml_ref )
#
# Takes a reference to content in the internal xml format and returns a
# reference to a new scalar with the <NODEBODY> wrapper removed.  The
# caller's scalar is left untouched.
sub xml2html {
  my ($xml_ref) = @_;

  # Copy the content, dropping the first opening wrapper tag on the way...
  ( my $html = ${$xml_ref} ) =~ s{<NODEBODY>}{};

  # ...then the first closing wrapper tag.  Everything else passes through.
  $html =~ s{</NODEBODY>}{};

  return \$html;
}

=item B<html2xml_simple> - takes raw html content, tweaks it into the 
  internally used xml storage format.  This is currently in use 
  because the following html2xml displays a bug in converting 
  html comments into ampersand entities.
  You might presume that html2xml would be more robust (it 
  seems to make fewer assumptions about the input html being 
  reasonably clean, e.g. ampersand entities used correctly), 
  but it's doing something peculiar so this is the fallback.

=cut 

# html2xml_simple( $ref )
#
# Wraps raw html content in the top-level <NODEBODY> element used by the
# internal xml storage format.  No other conversion is done.
#
#   $ref - reference to a scalar holding the html.  The referenced
#          scalar is never modified; all work is done on a copy.
#
# Returns a reference to a new scalar holding the wrapped content.
sub html2xml_simple {
  my $ref = shift;

  # make a copy to work on
  my $text = ${ $ref };

  # Well formed xml requires a single top-level element.  Wrap by
  # concatenation rather than s/$/.../: a bare '$' also matches just
  # before a trailing newline, which would put the closing tag in front
  # of that newline and leave it outside the element.
  $text = '<NODEBODY>' . $text . '</NODEBODY>';

  return \$text;  # return a reference to converted content
}

=item B<html2xml> - like html2xml_simple, but makes fewer assumptions 
  about how clean the initial html is: converts to text first, then 
  from text to xml

=cut 

# html2xml( $html_ref )
#
# Two-stage conversion: the html is first turned into text by html2text,
# and that text is then turned into the internal xml by text2xml.
# Takes and returns a scalar reference, like the functions it chains.
sub html2xml {
  my ($html_ref) = @_;

  my $text_ref = html2text($html_ref);
  return text2xml($text_ref);
}


=item B<xml2text> - 
   takes content in the internal xml storage format, transforms it 
   back into a df-style block of text.

=cut 

# xml2text( $xml_ref )
#
# Takes a reference to content in the internal xml format and returns a
# reference to a new scalar holding the text form.  The caller's scalar
# is left untouched.
sub xml2text {
  my ($xml_ref) = @_;

  # Copy the content, dropping the first opening wrapper tag on the way,
  # then drop the first closing wrapper tag.
  ( my $text = ${$xml_ref} ) =~ s{<NODEBODY>}{};
  $text =~ s{</NODEBODY>}{};

  # Turn xml/html jumps back into bare doomfiles node names.  Only links
  # whose target is "<name>.html" and whose label is that same name are
  # touched; any other link is left as it is.
  $text =~ s{<A HREF="($DF_NODE_NAME_RULE)\.html">\1</A>}{$1}msg;

  # Entities are converted back into < > & after the markup has been
  # handled (the original author suspected this has to happen last).
  deamp($text);

  return \$text;
}

=item B<text2html> - given reference to block of df material in raw text form
                  convert to html form (primarily, this means identifying links and 
                  changing them to HREF form).
  Example usage:

    $bodyref = text2html( $node->get_body );
    print $fh ${ $bodyref };

=cut 

# text2html( $text_ref )
#
# Takes a reference to raw df text and returns a reference to a new
# scalar in which every doomfiles jump has become an html link.  The
# caller's scalar is left untouched.  No entity escaping is done here.
sub text2html {
  my ($text_ref) = @_;

  # For reference, a value of the rule recorded while debugging:
  #   (?x-ism: ([\ ]{2,})([0-9cA-Z_-]{3,})([\ ]{2,}|$) )
  # The substitution uses three captures: $1 and $3 are put back around
  # the link, $2 becomes both the link target and the link label.
  ( my $html = ${$text_ref} )
    =~ s{$DF_THOUGHTS_LINK_RULE}{$1<A HREF="$2.html">$2</A>$3}msg;

  return \$html;
}

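=item B<html2text> - inverse transform of the above.  Turns the html 
      jumps generated for df links back into bare node names and converts 
      ampersand entities back into < > &; any other html tags are left 
      in place.  Leaves whitespace alone (presumes this html was wrapped
      in PRE tags, and hence has significant whitespace).  It is very similar 
      to xml2text, above.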

=cut

# html2text( $html_ref )
#
# Takes a reference to html content and returns a reference to a new
# scalar holding the text form.  The caller's scalar is left untouched.
sub html2text {
  my ($html_ref) = @_;

  # Copy the content and turn html jumps back into bare doomfiles node
  # names.  Only links whose target is "<name>.html" and whose label is
  # that same name are touched; any other link is left as it is.
  ( my $text = ${$html_ref} )
    =~ s{<A HREF="($DF_NODE_NAME_RULE)\.html">\1</A>}{$1}msg;

  # Entities are converted back into < > & after the markup has been
  # handled (the original author suspected this has to happen last).
  deamp($text);

  return \$text;
}

=back 

=head2 Internally used functions

=over

=item B<ampup> - transform < > & into XML/HTML ampersand entities
  Acts directly on the given string. 
  Example:
    ampup($text)

=cut 

### TODO - Switch to one of the standard ways of doing this?

# ampup( $string )
#
# Escapes & < > as ampersand entities, directly in the caller's string:
# the elements of @_ alias the caller's variables, so editing $_[0]
# edits the string that was passed in.
sub ampup {
  # Ampersands go first; otherwise the '&' of the entities produced
  # below would be escaped a second time.
  $_[0] =~ s{\&}{\&amp;}g;
  $_[0] =~ s{<}{\&lt;}g;

  # As before, the result of the final substitution is the return value.
  return $_[0] =~ s{>}{\&gt;}g;
}

=item B<deamp> - reverses the ampup transform, turns XML/HTML 
  ampersand entities back into: < > &    
  Acts directly on the given string. 
  Example:
    deamp($text)

=cut 

# deamp( $string )
#
# Turns the entities &lt; &gt; &amp; back into < > &, directly in the
# caller's string: the elements of @_ alias the caller's variables, so
# editing $_[0] edits the string that was passed in.
sub deamp {
  $_[0] =~ s{\&lt;}{<}g;
  $_[0] =~ s{\&gt;}{>}g;

  # &amp; goes last, so that an escaped entity such as "&amp;lt;" comes
  # out as the literal text "&lt;" rather than "<".  As before, the
  # result of this final substitution is the return value.
  return $_[0] =~ s{\&amp;}{\&}g;
}

1;
__END__

=back

=head1 SEE ALSO

L<Project Documentation|Web::Project>

=head1 AUTHOR

Joseph Brenner, E<lt>doom@kzsu.stanford.eduE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2004 by Joseph Brenner

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.2 or,
at your option, any later version of Perl 5 you may have available.

=head1 BUGS

None reported... yet.

=cut

     

Joseph Brenner, Sat Nov 6 17:04:11 2004