# Web::Chain project: Web/Pro/Transform.pm
package Web::Pro::Transform;
# doom@kzsu.stanford.edu
# 08 Oct 2004
=head1 NAME
Web::Pro::Transform - convert text between doomfiles text, internal XML, and HTML
=head1 SYNOPSIS
use Web::Pro::Transform qw( text2xml );
[...]
$node->set_body( text2xml( $rawtext_ref ) );
=head1 DESCRIPTION
A collection of procedurally oriented functions for
transformations of text to and from different markup
languages.
This is a set of utilities to facilitate the Web::Chain
project (for improving "doomfiles" processing).
=head2 EXPORT
All of the following functions are OK for export:
=over
=cut
use 5.006;
use strict;
use warnings;
use Carp;
use Web::Definitions qw( $DEBUG
$DF_VERSION
$DF_THOUGHTS_LINK_RULE
$DF_NODE_NAME_RULE
);
# Exporter wiring.  Nothing is exported by default; each public function
# can be imported by name, or all of them at once through the ':all' tag.
require Exporter;
our @ISA = ('Exporter');

our %EXPORT_TAGS = (
    all => [
        qw(
            text2xml
            xml2text
            ampup
            deamp
            text2html
            html2text
            xml2html
            html2xml
            html2xml_simple
        )
    ],
);

# Everything under ':all' may also be requested individually.
our @EXPORT_OK = @{ $EXPORT_TAGS{all} };
our @EXPORT    = ();

# The module version is taken from the value imported from Web::Definitions.
our $VERSION = $DF_VERSION;
=item B<text2xml> - takes df-style block of text, transforms it
into xml used in internal storage.
=cut
# text2xml( $text_ref ) -> $xml_ref
#
# Takes a reference to a block of doomfiles text and returns a reference
# to a new string holding the internal XML form.  The string behind the
# incoming reference is not modified.
sub text2xml {
    my ($text_ref) = @_;

    # Private working copy of the caller's text.
    my $xml = $$text_ref;

    # Escape < > & before any markup is inserted, so that literal
    # characters in the text cannot be confused with the tags added below.
    ampup($xml);

    # Rewrite each doomfiles jump as an anchor.  $2 serves as both the
    # link target (with ".html" appended) and the link text; $1 and $3
    # are put back unchanged on either side.
    $xml =~ s|$DF_THOUGHTS_LINK_RULE|$1<A HREF="$2.html">$2</A>$3|msg;

    # Well-formed XML needs a single top-level element.  Without /m,
    # '$' matches before a trailing newline when there is one, so the
    # closing tag is placed in front of that newline.
    $xml =~ s{^}{<NODEBODY>};
    $xml =~ s{$}{</NODEBODY>};

    return \$xml;
}
=item B<xml2html> - takes content in internal xml format, transforms it
into html for web publication.
=cut
# xml2html( $xml_ref ) -> $html_ref
#
# Takes a reference to content in the internal XML form and returns a
# reference to a new string with the NODEBODY wrapper removed.  The
# string behind the incoming reference is not modified.
sub xml2html {
    my ($xml_ref) = @_;

    # Private working copy of the caller's content.
    my $html = $$xml_ref;

    # Remove the top-level wrapper.  Only the first opening tag and the
    # first closing tag are dropped; any later occurrences are kept.
    for ($html) {
        s{<NODEBODY>}{};
        s{</NODEBODY>}{};
    }

    return \$html;
}
=item B<html2xml_simple> - takes raw html content, tweaks it into internally
used xml storage format. This is currently in use
because the following html2xml displays a bug in converting
html comments into ampersand entities.
You might presume that html2xml would be more robust (it
seems to make fewer assumptions about the input html being
reasonably clean, e.g. ampersand entities used correctly),
but it's doing something peculiar, so this is the fallback.
=cut
# html2xml_simple( $html_ref ) -> $xml_ref
#
# Takes a reference to html content and returns a reference to a new
# string with that content wrapped in a NODEBODY element.  No escaping
# or other conversion is applied, and the string behind the incoming
# reference is not modified.
sub html2xml_simple {
    my ($html_ref) = @_;

    # Private working copy of the caller's content.
    my $xml = $$html_ref;

    # Well-formed XML needs a single top-level element.  Without /m,
    # '$' matches before a trailing newline when there is one, so the
    # closing tag is placed in front of that newline.
    for ($xml) {
        s{^}{<NODEBODY>};
        s{$}{</NODEBODY>};
    }

    return \$xml;
}
=item B<html2xml> - like html2xml_simple, but makes fewer assumptions
about how clean the initial html is: converts to text first, then
from text to xml
=cut
# html2xml( $html_ref ) -> $xml_ref
#
# Converts html to the internal XML form in two steps: html2text first,
# then text2xml on its result.  Takes and returns scalar references.
sub html2xml {
    my ($html_ref) = @_;

    my $text_ref = html2text($html_ref);

    return text2xml($text_ref);
}
=item B<xml2text> -
takes content in the internal xml storage format, transforms it
back into a df-style block of text.
=cut
# xml2text( $xml_ref ) -> $text_ref
#
# Takes a reference to content in the internal XML form and returns a
# reference to a new string in doomfiles text form.  The string behind
# the incoming reference is not modified.
sub xml2text {
    my ($xml_ref) = @_;

    # Private working copy of the caller's content.
    my $plain = $$xml_ref;

    # Remove the top-level wrapper (first opening and first closing tag).
    $plain =~ s{<NODEBODY>}{};
    $plain =~ s{</NODEBODY>}{};

    # Collapse anchors back into bare jumps.  An anchor only qualifies
    # when its target is "<name>.html" and its link text is that same
    # name (the \1 backreference); any other link is left as it is.
    $plain =~ s|<A HREF="($DF_NODE_NAME_RULE)\.html">\1</A>|$1|msg;

    # Entity decoding is kept as the final step, as in the original
    # ordering of this routine.
    deamp($plain);

    return \$plain;
}
=item B<text2html> - given reference to block of df material in raw text form
convert to html form (primarily, this means identifying links and
changing them to HREF form).
Example usage:
$bodyref = text2html( $node->get_body );
print $fh ${ $bodyref };
=cut
# text2html( $text_ref ) -> $html_ref
#
# Takes a reference to doomfiles text and returns a reference to a new
# string in which each doomfiles jump has become an html anchor.  The
# string behind the incoming reference is not modified, and no entity
# escaping is performed here.
sub text2html {
    my ($text_ref) = @_;

    # Private working copy of the caller's text.
    my $html = $$text_ref;

    # Debugging aid, left disabled:
    #($DEBUG) && print "DF_THOUGHTS_LINK_RULE: $DF_THOUGHTS_LINK_RULE\n\n";
    # Sample of the rule as once printed by the line above:
    # DF_THOUGHTS_LINK_RULE: (?x-ism: ([\ ]{2,})([0-9cA-Z_-]{3,})([\ ]{2,}|$) )

    # Rewrite each jump as an anchor.  $2 serves as both the link target
    # (with ".html" appended) and the link text; $1 and $3 are put back
    # unchanged on either side.
    $html =~ s|$DF_THOUGHTS_LINK_RULE|$1<A HREF="$2.html">$2</A>$3|msg;

    return \$html;
}
=item B<html2text> - inverse transform of the above. Essentially just strips
out html tags. Leaves whitespace alone (presumes this html was wrapped
in PRE tags, and hence has significant whitespace). It is very similar
to xml2text, above.
=cut
# html2text( $html_ref ) -> $text_ref
#
# Takes a reference to html content and returns a reference to a new
# string in doomfiles text form.  Whitespace is not touched.  The
# string behind the incoming reference is not modified.
sub html2text {
    my ($html_ref) = @_;

    # Private working copy of the caller's content.
    my $plain = $$html_ref;

    # Collapse anchors back into bare jumps.  An anchor only qualifies
    # when its target is "<name>.html" and its link text is that same
    # name (the \1 backreference); any other link is left as it is.
    $plain =~ s|<A HREF="($DF_NODE_NAME_RULE)\.html">\1</A>|$1|msg;

    # Entity decoding is kept as the final step, as in the original
    # ordering of this routine.
    deamp($plain);

    return \$plain;
}
=back
=head2 Internally used functions
=over
=item B<ampup> - transform < > & into XML/HTML ampersand entities
Acts directly on the given string.
Example:
ampup($text)
=cut
### TODO - Switch to one of the standard ways of doing this?
# ampup( $string )
#
# Escapes the three characters that are special in XML/HTML by
# replacing them with their entities:
#     &  ->  &amp;
#     <  ->  &lt;
#     >  ->  &gt;
#
# The argument is modified in place (through the @_ alias), so it must
# be a modifiable scalar variable, not a literal.  Nothing is returned.
sub ampup {
    # '&' has to be handled first: the entities written by the next two
    # substitutions contain '&' themselves and must not be escaped again.
    $_[0] =~ s/&/&amp;/g;
    $_[0] =~ s/</&lt;/g;
    $_[0] =~ s/>/&gt;/g;
    return;
}
=item B<deamp> - reverses the ampup transform, turns XML/HTML
ampersand entities back into: < > &
Acts directly on the given string.
Example:
deamp($text)
=cut
# deamp( $string )
#
# Reverses ampup: replaces the three entities with the characters they
# stand for:
#     &lt;   ->  <
#     &gt;   ->  >
#     &amp;  ->  &
#
# The argument is modified in place (through the @_ alias), so it must
# be a modifiable scalar variable, not a literal.  Nothing is returned.
sub deamp {
    $_[0] =~ s/&lt;/</g;
    $_[0] =~ s/&gt;/>/g;

    # '&amp;' has to be handled last: decoding it earlier would turn an
    # escaped "&amp;lt;" into "&lt;" and then wrongly into "<".
    $_[0] =~ s/&amp;/&/g;
    return;
}
1;
__END__
=back
=head1 SEE ALSO
L<Project Documentation|Web::Project>
=head1 AUTHOR
Joseph Brenner, E<lt>doom@kzsu.stanford.eduE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2004 by Joseph Brenner
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.2 or,
at your option, any later version of Perl 5 you may have available.
=head1 BUGS
None reported... yet.
=cut
Joseph Brenner,
Sat Nov 6 17:04:11 2004