downsize - HTML conversion utility 
Author Message
 downsize - HTML conversion utility

Attached is downsize, a little HTML conversion utility.

This will take an HTML document written with HTML 3.0 and Netscape-enhanced tags
and convert it to a document that is compliant with HTML 2.0. Thus, if you
have a page with lots of tables, you can use this to make a page that people
with Lynx or HotJava can view without confusion.

This is a good idea, because there are still lots of people out there using
old browsers such as Mosaic, HotJava, SlipKnot, and Lynx. Have you ever looked
at a table under Lynx? Not much fun. What this script does is strip all the
new tags and qualifiers, convert tables to unordered list arrangements, and
(optionally) convert any "hard formatting" tags (such as Bold and Italic) to
"soft formatting" tags (such as Strong Emphasis and Emphasis).

Please look at http://www.*-*-*.com/ ~rogaski/perl/webtools.html

Send any comments, suggestions, changes, etc. to me with a meaningful
subject line. Thanks.

-----
#include <std_disclaimer.h>

   Doc                              "They that can give up essential liberty
   aka Mark Rogaski                  to obtain a little temporary safety
                                     deserve neither liberty nor safety."
    http://www.*-*-*.com/ ~rogaski/                 -- Benjamin Franklin

---------------------------[ SNIP SNIP SNIP]------------------------------
#!/usr/bin/perl
#
# downsize - converts an HTML 3.0 or Netscape-Enhanced document into
#            a document compatible with a 2.0 Browser
#

#

# For details, look at http://www.*-*-*.com/ ~rogaski/perl/webtools.html
# This script may be freely modified and distributed, but please
# give me some credit :)
#
# Version 1.0 - 7 Jun 1995: Original release
#
# Release identification, shown by the -V option and in the output banner.
$verinfo = "v1.0";
$verdate = "7 Jun 1995";

# Usage text printed (via die) whenever the command line is not understood.
# Built from one string per line of help so options are easy to add.
$errmsg = join '',
    "Usage: $0 [-o filename] [-fhsV] filename\n",
    " -o filename      output file\n",
    " -f               use standard input\n",
    " -h               delimit tables with horizontal rules\n",
    " -s               convert hard format tags to soft format tags\n",
    " -V               version\n";

# prep(INPUT_NAME, OUTPUT_NAME)
#
# Opens the package-wide INFILE and OUTFILE handles that the rest of the
# script reads from and prints to.
#
#   INPUT_NAME   path of the document to read; when false (empty/undef)
#                INFILE is attached to standard input instead.
#   OUTPUT_NAME  path of the file to create; when false OUTFILE is attached
#                to standard output instead.
#
# Dies with a one-line message if any handle cannot be opened.
sub prep {                      # Open any necessary files
    # The flags tested below were never assigned in this copy of the script,
    # so the arguments were silently ignored; take them from the call.
    my ($in, $out) = @_;

    if ($in) {
        # Three-argument open: the file name can never be mistaken for an
        # open mode, whatever characters it contains.
        open(INFILE, '<', $in)
            or die "$0: Input file cannot be opened!\n";
    } else {
        # "-" is the documented two-argument spelling for STDIN.
        open(INFILE, "-") or die "$0: Cannot open STDIN!\n";
    }

    if ($out) {
        open(OUTFILE, '>', $out)
            or die "$0: Output file cannot be opened!\n";
    } else {
        # ">-" is the documented two-argument spelling for STDOUT.
        open(OUTFILE, ">-") or die "$0: Cannot open STDOUT!\n";
    }
}

# htmlsplit
#
# Reads the document from the INFILE handle and marks every tag boundary
# with the two-character separator "&&" so that the text can later be cut
# into alternating tag / text elements.
#
# NOTE(review): this copy of the routine is visibly incomplete:
#   * nothing in the read loop appends $_ to $inbuf, yet $inbuf is what the
#     clean-up substitutions below operate on;
#   * the statement that cuts $inbuf into an array is not present (only the
#     tail of its comment, "and text segments", survives);
#   * the assignment to $/ has no right-hand side;
#   * "Quote:" looks like an artifact of the forum page the script was
#     copied from, not part of the program.
# Recover the missing lines from an intact copy before relying on this.
sub htmlsplit {                 # This subroutine takes an HTML document
                                # in a single scalar and breaks it up into
                                # an array of text segments and tags

    # One input line at a time: drop newlines, then bracket each tag with
    # the "&&" separator ("&&<tag>&&").
    while (<INFILE>) {
        s/\n//g;                # I didn't use chop, because I don't
                                #   want to kill any non-nl chars
                                #   ie. lastline has no nl
        s/</&&</g;                # Add some groovy control characters
        s/>/>&&/g;

    }

    # Normalise the buffer: no spaces around a separator, no doubled
    # separators, tabs turned into single spaces.
    $inbuf =~ s/ *&& */&&/g;            # Clean up the buffer
    $inbuf =~ s/&&&&/&&/g;
    $inbuf =~ s/\t/ /g;

                                   # and text segments
    # NOTE(review): incomplete statement - the value assigned to $/ is missing.
    $/ =

Quote:
}

# softfmt([ELEMENTS...])
#
# Converts "hard" (physical) formatting tags to their "soft" (logical)
# counterparts:
#
#     <b>, <blink>       ->  <strong>
#     <u>, <i>, <s>      ->  <em>
#     <tt>               ->  <code>
#
# Closing tags are converted the same way and any attributes on the tag are
# dropped.  Each element is expected to be either a single tag or a run of
# plain text.
#
# With arguments, the arguments are copied, converted and returned; the
# caller's data is left alone.  Without arguments, the package array @target
# is converted in place (and also returned).
#
# NOTE(review): the loop header of this routine was missing from the copy
# this was restored from; @target is the array name used by cvttbls in this
# file - confirm the intended calling convention against an intact copy.
sub softfmt {                   # Converts hard fmt tags to soft fmt tags
    my $items = @_ ? [@_] : \@target;

    foreach my $item (@$items) {
        # "( .*)?" only lets attributes follow the tag name, so <b> does not
        # swallow <blink>, <body>, <br>, ... and <s> does not touch <strong>.
        $item =~ s#<(/?)b( .*)?>#<$1strong>#i;
        $item =~ s#<(/?)blink( .*)?>#<$1strong>#i;
        $item =~ s#<(/?)u( .*)?>#<$1em>#i;
        $item =~ s#<(/?)i( .*)?>#<$1em>#i;
        $item =~ s#<(/?)s( .*)?>#<$1em>#i;
        $item =~ s#<(/?)tt( .*)?>#<$1code>#i;
    }

    return @$items;
}

# stripnew([ELEMENTS...])
#
# Removes or replaces HTML 3.0 / Netscape-only markup so that what is left
# is acceptable to an HTML 2.0 browser:
#
#   * <big>/<small> become <em>, <sub> becomes <var>, <sup> becomes <em>^;
#   * attributes are stripped from body, hr, br, ul, dl, dt, dd, ol, li, p;
#   * <nobr>, <wbr>, <font>, <basefont> and <center> tags are deleted;
#   * &reg and &copy are spelled out as (R) and (C);
#   * width/height/border/vspace/hspace/lowsrc attributes and the
#     Netscape-only align values are removed from any remaining tag.
#
# Each element is expected to be either a single tag or a run of plain text.
#
# With arguments, the arguments are copied, converted and returned.  Without
# arguments, the package array @target is converted in place (and returned).
#
# NOTE(review): the loop header of this routine was missing from the copy
# this was restored from; @target is the array name used by cvttbls in this
# file - confirm the intended calling convention against an intact copy.
sub stripnew {                  # Strips out any HTML3 or Netscape tags
                                #  or qualifiers
    my $items = @_ ? [@_] : \@target;

    foreach (@$items) {
        # These are for replacing newer formatting tags
        s#<(/?)big( .*)?>#<$1em>#i;
        s#<(/?)small( .*)?>#<$1em>#i;
        s#<(/?)sub( .*)?>#<$1var>#i;
        s#<sup( .*)?>#<em>^#i;
        s#</sup( .*)?>#</em>#i;

        # These are for removing all qualifiers
        s/<body( .*)>/<body>/i;
        s/<hr( .*)>/<hr>/i;
        s/<br( .*)>/<br>/i;
        s/<ul( .*)>/<ul>/i;
        s/<dl( .*)>/<dl>/i;
        s/<dt( .*)>/<dt>/i;
        s/<dd( .*)>/<dd>/i;
        s/<ol( .*)>/<ol>/i;
        s/<li( .*)>/<li>/i;
        s/<p( .*)>/<p>/i;

        # These are for removing tags without replacement.
        # The closing </nobr> has to go too, or it is left orphaned.
        s/<\/?nobr>//i;
        s/<wbr>//i;
        s/<\/?font.*>//i;
        s/<\/?basefont.*>//i;
        s/<\/?center.*>//i;

        # These are for replacing newer control sequences
        s/&reg;?/\(R\)/ig;
        s/&copy;?/\(C\)/ig;

        # These are for removing specific qualifiers (tags only, never text)
        if (/<.*>/) {
            # Remove just the attribute and its value (quoted or bare).  A
            # greedy "attr=.*" would also eat every attribute that follows.
            foreach my $attr (qw(width height border vspace hspace lowsrc)) {
                s/ *${attr}=(?:"[^"]*"|[^\s>]*)//i;
            }
            s/ *align=\"?(left|right|texttop|absmiddle|baseline|absbottom)\"?//i;
        }
    }

    return @$items;
}

# cleanup(INPUT_NAME, OUTPUT_NAME)
#
# Closes the handles opened by prep.  A handle is closed only when the
# corresponding name is true, i.e. when it refers to a real file rather
# than to standard input / standard output.
#
# Dies if the output file cannot be closed: buffered write errors (disk
# full, ...) are only reported at close time, and ignoring them would leave
# a truncated document behind without any diagnostic.
sub cleanup {                   # Close any necessary files
    # The flags tested below were never assigned in this copy of the script,
    # so nothing was ever closed; take them from the call.
    my ($in, $out) = @_;

    if ($in) {
        close(INFILE);
    }
    if ($out) {
        close(OUTFILE)
            or die "$0: Output file cannot be closed: $!\n";
    }
}

# cvttbls
#
# Rewrites each <table> found in the element array @target as an unordered
# list: <table> becomes <ul>, <td> becomes <li>, <th> becomes <h4> and
# <caption> becomes <blockquote>.  The outer while loop repeats the scan
# until a pass finds no table ($t_found stays undefined).  When $addhr is
# set something extra is done per table; the statements are missing here
# (the usage text describes -h as "delimit tables with horizontal rules").
#
# NOTE(review): this copy of the routine is visibly incomplete.  Braces do
# not balance, several branches are empty, and the substitutions operate on
# $_ without any visible loop that sets it.  "Quote:" looks like an artifact
# of the forum page the script was copied from.  The comments below describe
# only what the surviving lines do; recover the missing lines from an intact
# copy before relying on this.
sub cvttbls {                   # Converts tables into unordered lists
                                # I'm pleased with the way headers and
                                # captions turn out, but lots of images
                                # still don't turn out too hot



    $t_found = 1;
    while ($t_found) {          # processes the whole array separately
                                # for each table found


        # Reset all per-pass state.
        undef($tbl);
        undef($head);
        undef($cap);
        undef($t_found);
        undef($t_begin);
        undef($t_end);
        undef($size);

        undef($capalign);


        # Pass 1: locate the next table and note where it starts and ends.
        # $tbl, $head and $cap are "currently inside a table / header cell /
        # caption" flags.
        # NOTE(review): "$i < $#target" stops one short of the last element;
        # confirm whether that is intended.
        for ($i = 0;$i < $#target;$i++) {
            # Row tags are skipped.
            if ($target[$i] =~ /<tr( .*)?>|<\/tr( .*)?>/i) {
                next;
            }
            if ($target[$i] =~ /<table( .*)?>/i) {
                $t_found++;
                $tbl++;
                $t_begin = $i;

                next;
            }
            if ($target[$i] =~ /<\/table( .*)?>/i) {
                undef($tbl);
                $t_end = $i;
                # Number of elements from <table> to </table>, inclusive.
                $size = ($t_end - $t_begin) + 1;

                last;
            }  
            if ($tbl) {
                if ($target[$i] =~ /<th( .*)?>/i) {
                    $head++;

                    next;
                }
                if ($target[$i] =~ /<\/th( .*)?>/i) {
                    undef($head);

                    next;
                }
                if ($head) {

                    next;
                }
                if ($target[$i] =~ /<caption( .*)?>/i) {
                    $cap++;
                    # Remember the caption's align value, if it has one.
                    ($target[$i] =~ /align="?(.*)"?\b/i) && ($capalign = $1);

                    next;
                }
                if ($target[$i] =~ /<\/caption( .*)?>/i) {
                    undef($cap);

                    next;
                }
                if ($cap) {

                    next;
                }

            }
        }

            # Table structure -> list structure.  The img rule rewrites the
            # tag to its captured src part plus align="center".
            # NOTE(review): these act on $_; the loop that supplied it is
            # not present in this copy.
            s/<table( .*)?>/<ul>/i;
            s/<\/table( .*)?>/<\/ul>/i;
            s/<td( .*)?>/<li>/i;
            s/<\/td( .*)?>/<\/li>/i;
            s/<img( .*)(src=\W+)( .*)?>/<img $2 align=\"center\">/i;
        }


                # Captions become block quotes.
                s/<caption( .*)?>/<blockquote>/i;
                s/<\/caption( .*)?>/<\/blockquote>/i;
            }
            # NOTE(review): both branches are empty in this copy; presumably
            # they placed the caption according to its align value - verify.
            if ($capalign =~ /top/) {

            } else {

            }
        }


                # Header cells become level-4 headings.
                s/<th( .*)?>/<h4>/i;
                s/<\/th( .*)?>/<\/h4>/i;
            }

        }
        # NOTE(review): body missing in this copy.
        if ($addhr && $t_found) {


        }

    }
    # Blank out any <hr> element that directly follows another <hr>.
    for ($i = 1;$i < $#target;$i++) {
        ($target[$i] =~ /<hr>/i) && ($target[$i - 1] =~ /<hr>/i) &&
            ($target[$i] = "");
    }

Quote:
}

# formatdoc
#
# Packs tag and text elements into output lines no longer than $buflen
# characters, accumulating the current line in $tmpbuf and separating
# elements with a single space.
#
# NOTE(review): this copy of the routine is visibly incomplete.  The "next"
# statements and the closing brace after the last append imply an enclosing
# loop over the elements (with the current element in $_) whose header is
# missing, and every place where $tmpbuf is reset to "" was presumably
# preceded by a statement that stored the finished line.  "Quote:" looks
# like an artifact of the forum page the script was copied from.  Recover
# the missing lines from an intact copy before relying on this.
sub formatdoc {                 # takes an array of tags and text and returns
                                # an array of output lines formatted for
                                # printing


    $buflen = 80;               # Set this to the length of the output
                                # line - including the newline
    $tmpbuf = "";

        # An element that cannot fit on a line by itself: append it, then
        # start a fresh line.
        if ((1 + length) > $buflen) {
            $tmpbuf = $tmpbuf.$_." ";

            $tmpbuf = "";
            next;
        }
        # The element would overflow the current line: start a fresh line
        # first (the element is appended by the last statement below).
        if ((length($tmpbuf) + 1 + length) > $buflen) {

            $tmpbuf = "";
        }
        # <br>, <hr> and comments end the line they appear on.
        if (/<br( .*)?>|<hr( .*)?>|<!--.*-->/i) {
            $tmpbuf = $tmpbuf.$_." ";

            $tmpbuf = "";
            next;
        }
        # Block-level openers (<p>, any "<?l>" list tag such as <ul>/<ol>/<dl>,
        # <li>, <pre>, <body>, <blockquote>, <cite>, <xmp>) begin a new line.
        if (/<p( .*)?>|<.l( .*)?>|<li( .*)?>|<pre( .*)?>|<body( .*)?>/i ||
            /<blockquote( .*)?>|<cite( .*)?>|<xmp( .*)?>/i) {

            $tmpbuf = "";
        }
        $tmpbuf = $tmpbuf.$_." ";
    }


Quote:
}

##################################################################

# Parse the command line.
#
# Sets the package variables the rest of the script reads:
#   $infile   input file name (first non-option argument)
#   $outfile  output file name (operand of -o)
#   $stdin    -f: read standard input instead of a file
#   $addhr    -h: delimit tables with <HR>
#   $soft     -s: convert hard formatting to soft formatting
#   $version  -V: print version information only
#
# Single-letter flags may be bundled ("-hs"); -o must be given on its own
# and is followed by the file name as a separate argument.  Any
# inconsistency ends the program with the usage text in $errmsg.
#
# Iterating on "while (@ARGV)" rather than on the truth of the next argument
# keeps an argument of "0" from silently ending the parse.
while (@ARGV) {
    $_ = shift @ARGV;

    # Anything that does not start with '-' is the input file.  Only one
    # input is allowed, and it excludes -f.
    if (!/^-/) {
        die $errmsg if $_ eq '' || $infile || $stdin;
        $infile = $_;
        next;
    }

    /^-.*[^ofhsV]/ && die $errmsg;   # Check for non-existent flags
    /^-o./ && die $errmsg;           # -o must be separate

    if ($_ eq '-o') {                # output file
        $outfile = shift @ARGV;
        # The operand must exist and must not look like another option.
        # (Anchored: a hyphen elsewhere in the name is perfectly legal.)
        die $errmsg
            if !defined($outfile) || $outfile eq '' || $outfile =~ /^-/;
        next;
    }

    if (/f/) {                       # use as a UNIX filter
        # Check before counting the flag; testing the result of a
        # post-increment would skip the check the first time -f is seen.
        die $errmsg if $infile;
        $stdin++;
    }
    $addhr++   if /h/;               # delimit tables with <HR>
    $soft++    if /s/;               # convert hard formatting to soft formatting
    $version++ if /V/;               # current version
}

# Check the files if necessary
if ($infile) { die "$0: Input file does not exist!\n" unless -e $infile; }
if ($outfile) { die "$0: Output file already exists!\n" if -e $outfile; }

# Main program: either report the version, or open the files, convert the
# document, write it out preceded by an identifying comment banner, and
# close the files again.
#
# NOTE(review): this copy of the driver is visibly incomplete.  The version
# message ends in a dangling "." (its continuation line is missing), the
# calls that read, convert and format the document between prep and the
# banner are missing, and the last "print OUTFILE" has no argument.
# "Quote:" looks like an artifact of the forum page the script was copied
# from.  Recover the missing lines from an intact copy before relying on
# this.
if ($version) {                 # Just version info
    print "downsize HTML Converter $verinfo ($verdate)\n" .

Quote:
} else {                        # Everything else

    &prep($infile,$outfile);        # Open the appropriate files





    # Shameless self-promotion
    # (an HTML comment block written at the top of the converted document)
    print OUTFILE
        "<!-- This document created with downsize $verinfo       -->\n";
    print OUTFILE
        "<!-- downsize is available online at:                   -->\n";
    print OUTFILE
        "<!-- http://www.*-*-*.com/ ~rogaski/perl/webtools.html -->\n";
    # NOTE(review): the list to print is missing here.
    print OUTFILE


    &cleanup($infile,$outfile);                     # Close the files, etc.

Quote:
}



Sun, 23 Nov 1997 03:00:00 GMT  
 
 [ 1 post ] 

 Relevant Pages 

1. Perl number conversion Utilities

2. Conversion utility for MAN pages to PostScript

3. HTML::TreeBuilder and HTML conversion to XHTML

4. RTF to HTML, RTF to Text, HTML to RTF conversion

5. HTML to postscript conversion

6. Perl and document conversion to HTML ...SOLUTION?

7. Perl and document conversion to HTML

8. Feasibility of HTML to MS Word conversion?

9. Help: text to HTML Form conversion using perl

10. text to html conversion...

11. Html to text conversion

12. Html to text conversion

 

 
Powered by phpBB® Forum Software