from __future__ import generators
#-------------------------------------------------------------------------------
#
#  Convert an HTML file generated by Microsoft Word into one or more HTML files.
#
#  It includes the ability to:
#  - Create an external CSS style sheet used by all generated pages.
#  - Automatically insert default or custom navigation links at the top and
#    bottom of each page.
#  - Generate Microsoft Help Compiler compatible .hhc (table of contents),
#    .hhp (project), .stp (stop word list) and .chm (compiled help module)
#    files.
#
#  Written by: David C. Morrill
#
#  Date: 07/07/2003
#
#  (c) Copyright 2003 by David C. Morrill
#
#-------------------------------------------------------------------------------
#
#  To Do:
#  - Allow option to define your own document template to replace the default
#    tags.
#  - Allow option to use page titles as file names.
#  - Allow defining a custom, external document body processing function.
#  - Allow defining a custom external function that receives all page titles
#    and URLs, for generating custom navigation aids. This option could be
#    used to:
#    - Generate a table of contents page.
#    - Generate a JavaScript pop-up navigation menu.
#    (Maybe the previous two items should be combined into a single external
#    class with optional methods that can be invoked, like a delegate. The
#    option should allow multiple delegates).
#  - 'YakBack' related items:
#    - Allow e-mail to be sent on each new post.
#    - Allow privileged user(s) to delete comments.
#    - Allow suppressing sections with no comments on log page.
#    - Worry about file locking during updates?
#  - Issues on non-IE browsers:
#    - Mozilla under Red Hat: Click in 'Show tags' mode locks it in 'off'
#      mode instead of 'on' mode. Links are underlined in blue.
#    - Mac OS X Safari: 'Save preferences' link only runs 'false' script.
# #------------------------------------------------------------------------------- #------------------------------------------------------------------------------- # Imports: #------------------------------------------------------------------------------- import sys import re import os.path import binascii from getopt import getopt, GetoptError from exceptions import ValueError from os import system from time import time, asctime #------------------------------------------------------------------------------- # Constants: #------------------------------------------------------------------------------- # Last heading level processed: LAST_HEADING = 9 # Special CSS styles: special_styles = [ 'a:link', 'a:visited', 'a:active', 'a:hover' ] # Ignore CSS styles: ignore_styles = [ '@page' ] # Standard HTML file extensions: html_exts = [ '.html', '.htm' ] # Parsing regular expressions: tag_pat = re.compile( r'<(.*?)>| |\r', re.DOTALL | re.IGNORECASE ) title_pat = re.compile( r'(.*?)', re.DOTALL | re.IGNORECASE ) style_pat = re.compile( r'', re.DOTALL | re.IGNORECASE ) body_pat = re.compile( r']*?)>(.*)', re.DOTALL | re.IGNORECASE ) comment_pat = re.compile( r'/\*(.*?)\*/', re.DOTALL | re.IGNORECASE ) fontface_pat = re.compile( r'\s@font-face\s+?{\s*?font-family:\s*?"?(.*?)"?;(.*?)}', re.DOTALL | re.IGNORECASE ) fontfamily_pat = re.compile( r'''font-family\s*:\s*['"]?([a-zA-Z0-9 ]*)''', re.DOTALL | re.IGNORECASE ) class_pat = re.compile( r'\sclass=([a-zA-Z0-9_]*)\s', re.DOTALL | re.IGNORECASE ) instyle_pat = re.compile( r'''\sstyle=(['"])(.*?)\1''', re.DOTALL | re.IGNORECASE ) ahref_pat = re.compile( r'(.*?)', re.DOTALL | re.IGNORECASE ) aname_pat = re.compile( r'(.*?)', re.DOTALL | re.IGNORECASE ) br_pat = re.compile( r'
|', re.DOTALL | re.IGNORECASE ) yakback_pat = re.compile( r'(?:

|)(.*?)

|(?:
  • |)(.*?)
  • ', re.DOTALL | re.IGNORECASE ) doc_pat = re.compile( r'\$DOC_[a-zA-Z0-9]+\$', re.DOTALL ) page_pat = re.compile( r'\$PAGE_[a-zA-Z0-9]+\$', re.DOTALL ) #------------------------------------------------------------------------------- # 'pydoh' class: # - Read the input HTML file. # - Parse the interesting header and body information. # - Build the global style sheet. # - Break the document into sub-documents at heading levels. # - Adjust tags to contain cross-file links. # - Generate a .hhc file (if requested). # - Generate the individual page files (with optional navigation links). #------------------------------------------------------------------------------- class pydoh: #---------------------------------------------------------------------------- # Class constants: #---------------------------------------------------------------------------- VERSION = '0.4' DATE = '08/02/2003' # String of short form command line options: short_options = 'a:bBc:d:e:g:h:kl:mn:o:qs:StTvx:X:y' # List of long form command line options: long_options = [ 'author=', 'bottom=', 'nobottom', 'toc=', 'body=', 'email=', 'taglen=', 'hhc=', 'heading=', 'label=', 'chm', 'navbar=', 'output=', 'quiet', 'css=', 'nocss', 'top=', 'notop', 'version', 'omit=', 'ext=', 'yakback' ] #---------------------------------------------------------------------------- # Default page navigation templates: #---------------------------------------------------------------------------- # Default top of page navigation links: DEFAULT_TOP_LINKS = """
    $DOC_TITLE$
    $IF_PREV$Previous: $PREV_TITLE$$ELSE$ $END$ Table of Contents $IF_NEXT$Next: $NEXT_TITLE$$ELSE$ $END$
    """ SHORT_DEFAULT_TOP_LINKS = """
    """ # Default bottom of page navigation links: DEFAULT_BOTTOM_LINKS = """
    $IF_PREV$Previous: $PREV_TITLE$$ELSE$ $END$ Table of Contents $IF_NEXT$Next: $NEXT_TITLE$$ELSE$ $END$
    $DOC_TITLE$
    """ SHORT_DEFAULT_BOTTOM_LINKS = """
    """ #---------------------------------------------------------------------------- # Default CSS styles to be added: #---------------------------------------------------------------------------- default_styles = """ a:link, a:visited {color:#000000;text-decoration:none} a:active, a:hover {color:#B00000;text-decoration:underline} """ #---------------------------------------------------------------------------- # CSS comment template: #---------------------------------------------------------------------------- css_comment = """ /*----------------------------------------------------------------------------*/ /* */ /* %s*/ /* */ /*----------------------------------------------------------------------------*/ """ #---------------------------------------------------------------------------- # HTML document template: #---------------------------------------------------------------------------- html_template = """$HTML_PREFIX$ $PAGE_TITLE$ $HEAD_TAGS$ $BODY_PREFIX$$BODY$ """ #---------------------------------------------------------------------------- # Microsoft Windows Help file templates: #---------------------------------------------------------------------------- # Project file template: hhp_body_template = """[OPTIONS] Compatibility=1.1 or later Compiled file=%s.chm Contents file=%s.hhc Default topic=%s_1.html Display compile progress=No Full text search stop list file=%s.stp Full-text search=Yes Language=0x409 English (United States) Title=%s [FILES] %s_1.html [INFOTYPES] """ # Table of contents file template: hhc_body_template = """
      %s
    """ # Table of contents element template: hhc_element_template = """
  • """ # Standard 'stop word' list used for Help Workshop: stop_list = """a,about,after,against,all,also,among,an,and,are,as,at,be became,because,been,between,but,by,can,come,do,during,each,early,for,form,found from,had,has,have,he,her,his,however,in,include,into,is,it,its,late,later,me,med made,many,may,more,most,near,no,non,not,of,on,only,or,other,over,several,she some,such,than,that,the,their,then,there,these,they,this,through,to,under,until use,was,we,were,when,where,which,who,with,you""" #---------------------------------------------------------------------------- # 'YakBack' related information: #---------------------------------------------------------------------------- # YakBack file header: yakback_header = """ """ # YakBack extra section information: yakback_head_extra = ('\n') # YakBack 'No comment' tag icon: yb_gif = """ 4749463839610C000900800100000000FFFFFF21F90401E80301002C000000000C0009000002118C 0DA970B8D9C2836DB23AA3D539FA1F15003B """ # YakBack 'Has comments' tag icon: yb1_gif = """ 4749463839610C000900800100C40000FFFFFF21F90401E80301002C000000000C0009000002118C 0DA970B8D9C2836DB23AA3D539FA1F15003B """ # YakBack 'Open' button icon: ybdown_gif = """ 47494638396117000C00C41300C5C5C5FFFFFFEDEDEDDFDFDFD8D8D8B4B4B4E3E3E3F2F2F29F9F9F 8E8E8E000000D1D1D1ABABAB8D8D8D828282878787BEBEBE959595838383FFFFFF00000000000000 000000000000000000000000000000000000000000000000000000000021F90401E80313002C0000 000017000C00000549E0048C64699E62200C6CEBBE2D5100C601DF2F8000ECE99B89046F3052188F 48E4283824259F0A12B35782464B53E24979CAFE8C3FEF6FCC5C3082E8B43AED784C2091B55CED98 8400003B """ # YakBack 'Close' button icon: ybup_gif = """ 47494638396117000C00C41300C5C5C5FFFFFFE3E3E3DFDFDFCFCFCFBFBFBFEDEDEDF2F2F2DCDCDC CCCCCC9696968E8E8E000000D8D8D8828282858585B3B3B39E9E9E878787FFFFFF00000000000000 000000000000000000000000000000000000000000000000000000000021F90401E80313002C0000 000017000C00000548E0048C64699E62200C6CEBBE2D5100C601DF6FA200ECE99B8B056FF063307E 
C1A1CF68F4257B25A6F4587A1247D32C1560C56AB3A3EE6F9C447083E8B41AED784C2091B55CED0E 01003B """ #---------------------------------------------------------------------------- # YakBack CSS styles: #---------------------------------------------------------------------------- yakback_styles = """ .yb_mode { font-family:Arial,Helvetica,sans-serif; font-size:8pt; border:2px solid #C00000; background-color:#D0F0D0; padding:0 0 2 0; margin-top:4px; margin-bottom:4px; } .yb_td { font-family:Arial,Helvetica,sans-serif; font-size:8pt; } .yb_help { font-family:Arial,Helvetica,sans-serif; border-top:1px solid black; background-color:#FFFFFF; padding:6 10; margin: 2 0 0 0; } .yb_h1 { font-family:Arial,Helvetica,sans-serif; font-size:12pt; font-weight:bold; border:1px solid black; padding:2 6; background-color:#C0E0FF; margin:4 0; } .yb_p { font-family:Arial,Helvetica,sans-serif; font-size:10pt; margin:8 0; } .yb_ul { font-family:Arial,Helvetica,sans-serif; font-size:10pt; margin:0 6 0 30; } .yb_title { font-family:Arial,Helvetica,sans-serif; font-size:12pt; font-weight:bold; border:1px solid black; padding:3; background-color:#C0E0FF; margin:4 0; text-align:center; } .yb_timelog { font-family:Arial,Helvetica,sans-serif; font-size:10pt; padding:0; margin:0; } .yb_pagelog { font-family:Arial,Helvetica,sans-serif; font-size:10pt; border:1px solid #505050; padding:3 0; background-color:#E0E0E0; margin:2 0; } .yb_dropdown { background-color:#E0E0E0; padding:3 7 1 7; margin:0; } .yb_block { font-family:Arial,Helvetica,sans-serif; font-size:10pt; border:2px solid #C00000; background-color:#E8E8E8; padding:0 3; margin-top:4px; margin-bottom:4px; } .yb_sig { font-size:8pt; border-left:1px solid #505050; border-right:1px solid #505050; border-top:1px solid #505050; background-color:#D0F0D0; padding:2 6; margin-top:3px; margin-bottom:0px; } .yb_number { font-weight:normal; } .yb_time { font-weight:normal; } .yb_name { font-weight:bold; } .yb_ip { font-weight:normal; } .yb_comment { 
border-top:1px solid #B0B0B0; border-left:1px solid #505050; border-right:1px solid #505050; border-bottom:1px solid #505050; background-color:#FFFFFF; padding:2 6; margin-top:0px; margin-bottom:3px; } .yb_author { border-top:1px solid #B0B0B0; border-left:1px solid #505050; border-right:1px solid #505050; border-bottom:1px solid #505050; background-color:#F0F8C8; padding:2 6; margin-top:0px; margin-bottom:3px; } .yb_form { border:1px solid black; background-color:#C0E0FF; padding:4 6 6 6; margin-top:3px; margin-bottom:3px; } .yb_wrap { padding:0; margin:0; } .yb_button { margin-top:4; } form { padding:0; margin:0 } """ #---------------------------------------------------------------------------- # YakBack Javascript support file: #---------------------------------------------------------------------------- yakback_js = """ var yb_form = null; var yb_button = null; var yb_label = null; function yb_form_toggle ( index ) { var element = document.getElementById( "yb_f" + index ); if ( element.style.display != "block" ) { element.style.display = "block"; if ( yb_form != null ) { yb_form.style.display = "none"; yb_label.style.display = "inline"; yb_button.src = "ybdown.gif"; } yb_form = element; yb_button = document.getElementById( "yb_b" + index ); yb_button.src = "ybup.gif"; yb_label = document.getElementById( "yb_l" + index ); yb_label.style.display = "none"; } else { element.style.display = "none"; yb_button.src = "ybdown.gif"; yb_label.style.display = "inline"; yb_form = yb_button = yb_label = null; } } function yb_help_toggle ( ) { var help = document.getElementById( "yb_help" ); if ( help.style.display == "block" ) { help.style.display = "none"; document.getElementById( "yb_helpb" ).src = "ybdown.gif"; } else { help.style.display = "block"; document.getElementById( "yb_helpb" ).src = "ybup.gif"; } } function yb_toggle ( index ) { var element = document.getElementById( "yb_c" + index ); if ( !yb_locked && (element.yb_locked == false) ) { element.yb_locked = true; 
return; } if ( element.style.display == "none" ) { element.style.display = "block"; } else { element.style.display = "none"; element.yb_locked = false; } } function yb_show ( index ) { var element = document.getElementById( "yb_c" + index ); if ( !yb_locked && (element.yb_locked != true) ) { element.style.display = "block"; element.yb_locked = false; } } function yb_hide ( index ) { var element = document.getElementById( "yb_c" + index ); if ( !yb_locked && (element.yb_locked != true) ) { element.style.display = "none"; element.yb_locked = false; } } function yb_HideAll ( ) { var elements = document.getElementsByName( "yb_tag" ); for ( i = 0; i < elements.length; i++ ) { elements[i].style.display = "none"; } for ( i = 0; i < elements.length; i++ ) { document.getElementById( "yb_c" + i ).style.display = "none"; } document.getElementById( 'new_mode' ).value = "hidden"; } function yb_ShowTags ( ) { yb_locked = false; var elements = document.getElementsByName( "yb_tag" ); for ( i = 0; i < elements.length; i++ ) { elements[i].style.display = "inline"; } for ( i = 0; i < elements.length; i++ ) { element = document.getElementById( "yb_c" + i ); element.style.display = "none"; element.yb_locked = null; } document.getElementById( 'new_mode' ).value = "none"; } function yb_ShowComments ( ) { yb_locked = true; var elements = document.getElementsByName( "yb_tag" ); var show = new Array( elements.length ); for ( i = 0; i < elements.length; i++ ) { elements[i].style.display = "inline"; var src = elements[i].src; show[i] = (src.substring( src.length - 7, src.length ) == "yb1.gif"); } for ( i = 0; i < elements.length; i++ ) { element = document.getElementById( "yb_c" + i ); element.style.display = (show[i]? 
"block": "none"); element.yb_locked = true; } document.getElementById( 'new_mode' ).value = "block"; } """ #---------------------------------------------------------------------------- # YakBack PHP server module: #---------------------------------------------------------------------------- yakback_php = r"""
    #: At , at said:
    ">
     ...click to add a new comment.
    Type your comment here:

    Your name:    
    $items ) { $link_tag = "{$link}#yb_t{$index}"; foreach ( $items as $item ) { $item[] = $link_tag; $comments[] = $item; } } } return; } $index = $page - 1; $display = ($mode == "hidden")? "none": "inline"; $gif = ($info != NULL)? "yb1": "yb"; ?>
     
    $items ) { $link_tag = "{$link}#yb_t{$index}"; foreach ( $items as $item ) { $item[] = $link_tag; $all_items[] = $item; } } usort( $all_items, 'time_sort' ); $num = 1; foreach ( $all_items as $item ) { list( $name, $ip, $timestamp, $when, $comment, $link ) = $item; $name = htmlspecialchars( $name ); $comment = htmlspecialchars( $comment ); ?>
    #: At , at said:
    ">
    "; } function time_sort ( $l, $r ) { $tl = $l[2]; $tr = $r[2]; if ( $tl == $tr ) { return 0; } return (($tl > $tr)? -1: 1); } function ybend ( ) { global $order, $comments, $author; if ( $order == "time" ) { print "
    "; if ( count( $comments ) == 0 ) { print "No comments on this document."; } else { usort( $comments, 'time_sort' ); $num = 1; foreach ( $comments as $item ) { list( $name, $ip, $timestamp, $when, $comment, $link ) = $item; $name = htmlspecialchars( $name ); $comment = htmlspecialchars( $comment ); ?>
    #: At , at said:
    ">
    "; } } function ybinit ( ) { global $mode; $here = $_SERVER[ 'PHP_SELF' ]; $log = substr( $here, 0, strrpos( $here, '_' ) ); ?>
      YakBack:   >Hide tags   >Show tags   >Show comments       Save preferences       View page log       View time log  ...click for help.  
    """ #---------------------------------------------------------------------------- # Usage description: #---------------------------------------------------------------------------- usage_info = """Usage is: python pydoh.py [options] input_file where: input_file = Name of HTML file to process (.htm or .html extension can be omitted). Valid options are: -a name = Specifies the 'name' of the author of the document. Used --author=name to highlight YakBack author responses. -b = Add navigation links to the bottom of each page. --bottom=file If '-b' is specified (the default), default formatted links are added to the bottom of each page. If '--bottom' is specified, 'file' contains the HTML template used to generate the navigation links. -B = Suppresses generation of navigation links at the --nobottom bottom of each page. -c n = Specifies the number of the page that the $TOC$ --toc=n template variable should refer to (default is 1). -d text = Specifies replacement or additional information to be --body=text added to each document's tag. If the first three characters of 'text' are '...', the remainder of 'text' is added to the original tag; otherwise 'text' replaces the original tag contents. -e address = Send e-mail to the address specified by 'addresss' --email address whenever a 'YakBack' comment is posted by someone. -g n = Specifies the minimum length of a YakBack enabled --taglen=n document section (default length is 80). -h n = Specifies the maximum heading level 'n' (1..9) used to --heading=n define generated file boundaries. By default, file splits occur at

    tags. -k = Specifies that a Microsoft Help Compiler compatible --hhc=file 'table of contents (i.e. '.hhc' file) should be generated. If '-k' is specified, the output file root prefix with a '.hhc' extension is used. If '--hhc' is specified, 'file' with a '.hhc' extension is used. -l text = Specifies the document label (i.e. title). If not --label=text specified, it defaults to the tag value of the specified input file. -m = Automatically generate the Windows Help (.chm) file --chm from the generated HTML files. -n file = Specifies a file containing HTML defining a site --navbar=file navigation bar. -o file = Specifies the output file root prefix. Each generated --output=file file will append '_n' to the specified root prefix to arrive at the final file name. The default output file root prefix is the same as the input file name. -q = Do not display informational messages. --quiet -s = Specifies additional CSS styles to be added to the --css file generated style sheet. If '-s' is specified (the default), the standard 'pydoh' CSS '<A>' link styles are used. If '--css' is specified, 'file' is a file containing the style rules to be added. -S = Do not add any additional CSS style rules beyond those --nocss specified in the main input HTML file. -t = Add navigation links to the top of each page. --top=file If '-t' is specified (the default), default formatted links are added to the top of each page. If '--top' is specified, 'file' contain the HTML template used to generate the navigation links. -T = Suppresses generation of navigation links at the --notop top of each page. -v = Display 'pydoh' version. --version -x n1,n2,... = Omit the pages numbered 'n1,n2,...' (i.e. one or more --omit=n1,n2,... page numbers separated by commas). -X ext = Specifies the file extension 'ext' to use for each --ext ext generated file (defaults to 'php' if YakBack is used, and 'html' otherwise). 
-y = Add 'YakBack' support to the generated files (which --yakback will have a '.php' extension instead of '.html'). """ #---------------------------------------------------------------------------- # Initialize the object: #---------------------------------------------------------------------------- def __init__ ( self, argv = None ): self.start_time = time() self.initialize() if argv is None: argv = sys.argv[1:] self.process_options( argv ) self.process_input() #---------------------------------------------------------------------------- # Initialize default values: #---------------------------------------------------------------------------- def initialize ( self ): self.top_links = self.DEFAULT_TOP_LINKS self.bottom_links = self.DEFAULT_BOTTOM_LINKS self.max_heading = 1 self.toc_page = 1 self.toc_root = None self.root = '' self.file_ext = None self.body_tag = '...' self.navbar = '$BODY$' self.title = None self.omit_list = [] self.css = '' self.yakback = False self.email = '' self.author = '' self.taglen = None self.gen_chm = False self.quiet = False self.files_written = 0 #---------------------------------------------------------------------------- # Parse the options to be used: #---------------------------------------------------------------------------- def process_options ( self, argv ): try: options, args = getopt( argv, self.short_options, self.long_options ) for option, value in options: self.process_option( option, value ) except GetoptError, excp: self.usage( str( excp ) ) except SystemExit: raise except: self.usage( 'Invalid option: %s %s' % ( option, value ) ) # Verify that the right number of file arguments was specified: if len( args ) == 0: self.usage( 'No input file specified' ) if len( args ) > 1: self.usage( 'Only one input file should be specified' ) # Make sure that the correct default top/bottom links are used: if ((self.bottom_links == '') and (self.top_links == self.DEFAULT_TOP_LINKS)): self.bottom_links = self.SHORT_DEFAULT_BOTTOM_LINKS 
elif ((self.top_links == '') and (self.bottom_links == self.DEFAULT_BOTTOM_LINKS)): self.top_links = self.SHORT_DEFAULT_TOP_LINKS # Set up the output .HTML and Window Help file prefixes to use: self.filename = args[0] if self.root == '': self.root = os.path.splitext( self.filename )[0] if ((self.toc_root == '') or (self.gen_chm and (self.toc_root is None))): self.toc_root = self.root # Check for options with prerequisites: if ((self.email != '') or (self.author != '') or (self.taglen is not None)): self.yakback = True if self.yakback and (self.taglen is None): self.taglen = 80 # Check for conflicting options: if self.yakback and (self.toc_root is not None): print ('The --yakback option conflicts with the --hhc or --chm ' 'option.\nThe --hhc and/or --chm options are being ignored.') self.toc_root = None self.gen_chm = False # Finalize the extra CSS information: if self.css is None: self.css = '' elif self.css == '': self.css = self.default_styles #---------------------------------------------------------------------------- # Process a single command line option and value: #---------------------------------------------------------------------------- def process_option ( self, option, value ): if (option == '-a') or (option == '--author'): self.author = ',%s,' % ( ','.join( [ x.strip() for x in value.split( ',' ) ] ) ) elif option == '-b': self.bottom_links = self.DEFAULT_BOTTOM_LINKS elif option == '--bottom': self.bottom_links = self.read_file( value, html_exts ) elif option == '-t': self.top_links = self.DEFAULT_TOP_LINKS elif option == '--top': self.top_links = self.read_file( value, html_exts ) elif (option == '-B') or (option == '--nobottom'): self.bottom_links = '' elif (option == '-T') or (option == '--notop'): self.top_links = '' elif (option == '-n') or (option == '--navbar'): self.navbar = self.read_file( value, html_exts ) elif (option == '-c') or (option == '--toc'): self.toc_page = int( value ) elif (option == '-d') or (option == '--body'): self.body_tag 
= value elif (option == '-e') or (option == '--email'): self.email = value elif (option == '-g') or (option == '--taglen'): self.taglen = int( value ) elif (option == '-k') or (option == '--hhc'): self.toc_root = value elif (option == '-l') or (option == '--label'): self.title = value elif (option == '-m') or (option == '--chm'): self.gen_chm = True elif (option == '-h') or (option == '--heading'): self.max_heading = int( value ) if not (0 <= self.max_heading <= LAST_HEADING): raise ValueError elif (option == '-o') or (option == '--output'): self.root = value elif (option == '-q') or (option =='--quiet'): self.quiet = True elif option == '-s': self.css = (self.css or '') + self.default_styles elif option == '--css': self.css = ((self.css or '') + self.read_file( value, [ '.css' ] )) elif (option == '-S') or (option == '--nocss'): self.css = None elif (option == '-v') or (option == '--version'): print 'pydoh version %s (%s)' % ( self.VERSION, self.DATE ) sys.exit(0) elif (option == '-x') or (option == '--omit'): self.omit_list.extend( [ int( x ) for x in eval( '[%s]' % value ) ]) elif (option == '-X') or (option == '--ext'): value = value.replace( '.', '' ); if value != '': self.file_ext = value elif (option == '-y') or (option == '--yakback'): self.yakback = True #---------------------------------------------------------------------------- # Process the input file: #---------------------------------------------------------------------------- def process_input ( self ): self.gen_time = asctime() self.gen_msg = 'Generated by pydoh version %s on %s' % ( self.VERSION, self.gen_time ) self.max_page = 0 self.next_style = 0 self.htmldocs = [] self.overrides = [] self.class_tags = {} self.styles = {} self.links = links = {} self.hrefs = hrefs = {} self.base_path, self.base_root = os.path.split( self.root ) # Select the correct file extension and header information to generate: if self.yakback: self.file_ext = self.file_ext or 'php' self.html_header = self.yakback_header % 
( self.email, self.author or 'the document author' ) self.body_header = '<?php ybinit(); ?>\n'; self.head_extra = self.yakback_head_extra self.write_file( os.path.join( self.base_path, 'yakback.js' ), self.yakback_js.strip() ) self.write_file( os.path.join( self.base_path, 'yakback.php' ), self.yakback_php.strip() ) self.write_hex_file( 'yb.gif', self.yb_gif ) self.write_hex_file( 'yb1.gif', self.yb1_gif ) self.write_hex_file( 'ybdown.gif', self.ybdown_gif ) self.write_hex_file( 'ybup.gif', self.ybup_gif ) else: self.file_ext = self.file_ext or 'html' self.html_header = self.body_header = '' self.head_extra = '' # Normalize the list of pages to be omitted: omit_list = self.omit_list omit_list.sort() while (len( omit_list ) > 0) and (omit_list[0] < 1): del omit_list[0] # Parse the information we need from the HTML input file: self.parse_html( self.filename ) # Build the optimized style sheet information: self.build_stylesheet() # Create the external css stylesheet: self.gen_stylesheet() # Split the base document into sub-documents at "<Hn>' tags: self.split_doc() # Gather up all of the documents in sorted page order: self.docs = docs = [ htmldoc for htmldoc in self.all_documents() ] docs.sort( lambda x, y: cmp( x.page, y.page ) ) # Generate the page links for all pages: for htmldoc in docs: htmldoc.gen_href() # Find all the valid 'hrefs' and catalog them: for htmldoc in docs: htmldoc.parse_hrefs( hrefs ) # Find all the targets actually used and note their page: for htmldoc in docs: htmldoc.parse_targets( hrefs, links ) # Patch all the cross-page links to point to the correct page: for htmldoc in docs: htmldoc.patch_hrefs( links ) # Perform any 'yakback' processing needed on the documents: if self.yakback: for htmldoc in docs: htmldoc.gen_yakback() # Provide a 'hook' to process all pages at once before starting # the page generation loop: self.process_pages() # Generate all of the individual HTML pages: for htmldoc in docs: self.gen_html( htmldoc ) # Generate the 
Windows Help files if requested: if self.toc_root is not None: self.gen_help() # Generate the YakBack comment log if YakBack support requested: if self.yakback: self.gen_yakback_log() # Display statistics (if requested): if not self.quiet: print '%d file%s generated in %.2f seconds.' % ( self.files_written, ' s'[ self.files_written != 1 ], (time() - self.start_time) ) # Compile the Windows Help (.chm) file if requested: if self.gen_chm: if self.quiet: system( 'hhc %s.hhp >nul' % self.toc_root ) else: print '\nCompiling the Windows Help file...\n' system( 'hhc %s.hhp' % self.toc_root ) #---------------------------------------------------------------------------- # Display usage information and terminate: #---------------------------------------------------------------------------- def usage ( self, msg = '' ): if msg != '': print '\nERROR: %s\n' % msg print self.usage_info sys.exit(1) #---------------------------------------------------------------------------- # Strip leading and trailing quotes from a string: #---------------------------------------------------------------------------- def strip_quotes ( self, string ): try: while string[:1] in "'\"": string = string[1:] while string[-1:] in "'\"": string = string[:-1] except: pass return string #---------------------------------------------------------------------------- # Create a normalized style from a style definition string: #---------------------------------------------------------------------------- def normalized_style ( self, style ): if style[-1] != ';': style += ';' style_dic = {} for item in style.split( ';' ): if item != '': name, value = item.split( ':' ) style_dic[ name.strip() ] = value.strip() keys = style_dic.keys() keys.sort() return ';'.join( [ '%s:%s' % ( key, style_dic[ key ] ) for key in keys ]) #---------------------------------------------------------------------------- # Read a specified input file (with optional extensions): 
#---------------------------------------------------------------------------- def read_file ( self, filename, extensions ): fh = None try: fh = open( filename, 'r' ) except: base, ext = os.path.splitext( filename ) if ext == '': for ext in extensions: try: fh = open( filename + ext, 'r' ) break except: pass if fh is None: print 'Unable to open input file:', filename sys.exit( 1 ) data = fh.read() fh.close() return data #---------------------------------------------------------------------------- # Write a specified file containing a specified data string: #---------------------------------------------------------------------------- def write_file ( self, filename, data, mode = 'w' ): fh = open( filename, mode ) fh.write( data ) fh.close() self.files_written += 1 #---------------------------------------------------------------------------- # Parse the interesting header and body information: #---------------------------------------------------------------------------- def parse_html ( self, filename ): if os.path.splitext( filename )[1].lower() == '.doc': filename = self.convert_doc( filename ) html = self.read_file( filename, html_exts ) if self.title is None: match = title_pat.search( html ) if match is not None: self.title = match.group(1) else: self.title = 'No title specified' self.style = ''.join( style_pat.findall( html ) ) match = body_pat.search( html ) self.doc = match.group(2) if self.body_tag[:3] == '...': self.body_tag = ('%s %s' % ( match.group(1).strip(), self.body_tag[3:].strip() )).strip() if self.body_tag != '': self.body_tag = ' ' + self.body_tag #---------------------------------------------------------------------------- # Convert a '.doc' file to a '.html' file: #---------------------------------------------------------------------------- def convert_doc ( self, filename ): base, ext = os.path.splitext( filename ) html_filename = base + '.html' try: import win32com.client word = win32com.client.Dispatch( 'Word.Application' ) count = 
word.Documents.Count word.Documents.Open( FileName = filename, ConfirmConversions = False, ReadOnly = False, AddToRecentFiles = False, PasswordDocument = "", PasswordTemplate = "", Revert = False, WritePasswordDocument = "", WritePasswordTemplate = "", Format = 0 ) # wdOpenFormatAuto word.ActiveDocument.SaveAs( FileName = html_filename, FileFormat = 10, # wdFormatFilteredHTML LockComments = False, Password = "", AddToRecentFiles = True, WritePassword = "", ReadOnlyRecommended = False, EmbedTrueTypeFonts = False, SaveNativePictureFormat = False, SaveFormsData = False, SaveAsAOCELetter = False ) if count < word.Documents.Count: word.ActiveDocument.Close( SaveChanges = False ) if word.Documents.Count == 0: word.Application.Quit( SaveChanges = False ) except: raise return html_filename #---------------------------------------------------------------------------- # Build the global style sheet: #---------------------------------------------------------------------------- def build_stylesheet ( self ): style = self.style doc = self.doc # Build the list of '@font-face' entries that are actually referenced: ff = {} for name, value in fontface_pat.findall( style ): ff[ self.strip_quotes( name ) ] = value.strip() self.font_faces = ff2 = {} for name in fontfamily_pat.findall( doc ): name = self.strip_quotes( name ) try: ff2[ name ] = ff[ name ] except: pass # Build a dictionary of all defined styles: style = fontface_pat.sub( '', style ) style = comment_pat.sub( '', style ) style = style.replace( '<!--', '' ).replace( '-->', '' ).strip() self.add_styles( self.css ) if self.yakback: self.add_styles( self.yakback_styles ) self.add_styles( style ) # Replace all inline 'style=' references with 'class=' references: s2c = self.style_to_class self.doc = s2c( self.doc ) self.top_links = s2c( self.top_links ) self.bottom_links = s2c( self.bottom_links ) self.navbar = s2c( self.navbar ) #---------------------------------------------------------------------------- # Add all of the CSS 
styles contained in a specified string to the style # dictionaries, ignoring any styles with names in a specified list: #---------------------------------------------------------------------------- def add_styles ( self, style ): class_tags = self.class_tags styles = self.styles for item in style.split( '}' ): col = item.find( '{' ) if col >= 0: names = item[:col].strip() norm_style = self.normalized_style( item[ col + 1: ].strip() ) class_tags2 = styles.get( norm_style, None ) if class_tags2 is None: styles[ norm_style ] = class_tags2 = [] for name in names.split( ',' ): name = name.strip() if name[-2:] == '.*': name = name[:-2].lower() self.overrides.append( name ) if ((name.split( ' ' )[0].lower() not in ignore_styles) and (not class_tags.has_key( name ))): class_tags[ name ] = norm_style if name.find( '.' ) < 1: class_tags2.append( name ) #---------------------------------------------------------------------------- # Analyze and replace all inline 'style' references with 'class' # references in a specified document by building a 'super style sheet' that # merges all inline styles with new or existing styles, so that the # resulting document only uses style classes, with no inline styles: #---------------------------------------------------------------------------- def style_to_class ( self, doc ): styles = self.styles class_tags = self.class_tags overrides = self.overrides new_doc = [] col3 = 0 while True: col1 = doc.find( '<', col3 ) if col1 < 0: break col2 = doc.find( '>', col1 ) if col2 < 0: break body = doc[ col1 + 1: col2 ].replace( '\n', ' ' ).replace( '\r', '' ).strip() + ' ' tag = body[ : body.find( ' ' ) ] cls = '' match = class_pat.search( body ) if match is not None: cls = match.group(1) body = class_pat.sub( ' ', body ) name = '%s.%s' % ( tag, cls ) match = instyle_pat.search( body ) if match is None: if cls == '': new_doc.append( doc[ col3: col2 + 1 ] ) col3 = col2 + 1 continue try: style = class_tags[ name ] except: style = '' else: body = 
instyle_pat.sub( '', body ) style = match.group(2) if cls != '': try: style = '%s;%s' % ( class_tags[ name ], style ) except: pass if (style != '') and (tag.lower() not in overrides): cls = '' tagdot = tag + '.' n = len( tagdot ) norm_style = self.normalized_style( style ) class_tags2 = styles.get( norm_style, None ) if class_tags2 is None: styles[ norm_style ] = class_tags2 = [] else: match = [ x for x in class_tags2 if x[:n] == tagdot ] if len( match ) > 0: name = match[0] cls = name[ name.find( '.' ) + 1: ] if cls == '': self.next_style += 1 cls = 's%d' % self.next_style class_tags2.append( '%s.%s' % ( tag, cls ) ) body = ('%sclass=%s %s' % ( body[:n], cls, body[n:] )) new_doc.append( '%s<%s>' % ( doc[ col3: col1 ], body.strip() ) ) col3 = col2 + 1 # Concatenate all of the fragments to form the new document: new_doc.append( doc[ col3: ] ) return ''.join( new_doc ) #---------------------------------------------------------------------------- # Build the external style sheet used by all of the generated HTML pages: #---------------------------------------------------------------------------- def gen_stylesheet ( self ): # Get the CSS styles to output: styles = self.styles # Generate a header comment: sheet = [ self.css_comment[1:] % self.gen_msg.ljust( 74 ) ] # Filter out any CSS rules that have to be specified in a certain order # and generate them in that order: special = {} for style, names in styles.items(): for name in names[:]: if name.lower() in special_styles: names.remove( name ) special[ name.lower() ] = style for name in special_styles: style = special.get( name, None ) if style is not None: sheet.append( '%s {' % name ) sheet.extend( [ (' %s;' % item) for item in style.split(';') ] ) sheet.append( '}\n' ) # Generate the remaining rules (in arbitrary order): for style in styles.keys(): names = styles[ style ] if len( names ) > 0: sheet.append( '%s {' % ', '.join( names ) ) sheet.extend( [ (' %s;' % item) for item in style.split(';') ] ) sheet.append( 
'}\n' ) # Output the CSS styles to a file: self.write_file( self.root + '.css','\n'.join( sheet ) ) #---------------------------------------------------------------------------- # Generate an HTML/PHP page: #---------------------------------------------------------------------------- def gen_html ( self, htmldoc ): top_links, bottom_links = self.gen_page_links( htmldoc, self.top_links, self.bottom_links ) body = self.expand_template( self.navbar, htmldoc, { 'BODY': '%s%s%s' % ( top_links, self.process_body( htmldoc.doc ), bottom_links ) } ) self.write_file( htmldoc.fileref, self.expand_template( self.html_template, htmldoc, { 'HTML_PREFIX': self.html_header, 'GENERATOR': self.gen_msg, 'HEAD_TAGS': self.head_extra, 'BODY_ATTRS': self.body_tag, 'BODY_PREFIX': self.body_header, 'BODY': body } ) ) #---------------------------------------------------------------------------- # Generate the navigation links for a page: #---------------------------------------------------------------------------- def gen_page_links ( self, htmldoc, top_links, bottom_links ): docs = self.docs page = htmldoc.page # Check for page 0 (YakBack log), which has no previous/next page link: has_next = has_prev = False if page > 0: has_next = (page < self.max_page) has_prev = (page > 1) top_links = self.process_if( 'NEXT', has_next, top_links ) top_links = self.process_if( 'PREV', has_prev, top_links ) bottom_links = self.process_if( 'NEXT', has_next, bottom_links ) bottom_links = self.process_if( 'PREV', has_prev, bottom_links ) first = self.page_href( 1 ) toc = self.page_href( self.toc_page ) next = prev = next_title = prev_title = '' if has_next: next = self.page_href( page + 1 ) next_title = docs[ page ].title if has_prev: prev = self.page_href( page - 1 ) prev_title = docs[ page - 2 ].title dic = { 'FIRST': first, 'TOC': toc, 'NEXT': next, 'PREV': prev, 'NEXT_TITLE': next_title, 'PREV_TITLE': prev_title, 'PAGE_TITLE': htmldoc.title, 'TIME': self.gen_time } # Generate all the various $Hn$ and 
$Hn_TITLE$ values: last_level = LAST_HEADING + 1 headings = [ None ] * last_level cur_doc = htmldoc while cur_doc is not None: level = cur_doc.level if level < last_level: heading = ( cur_doc.title, cur_doc.href ) for i in range( level, last_level ): headings[i] = heading last_level = level cur_doc = cur_doc.parent if last_level > 1: heading = ( self.title, self.page_href( 1 ) ) for i in range( 1, last_level ): headings[i] = heading # Substitute all of the $Hn$ and $Hn_TITLE$ variables in the templates: for i in range( 1, LAST_HEADING + 1 ): title, link = headings[i] dic[ '$H%d_TITLE$' % i ] = title dic[ '$H%d$' % i ] = link # Expand and return the templates using the dictionary: return ( self.expand_template( top_links, htmldoc, dic ), self.expand_template( bottom_links, htmldoc, dic ) ) #---------------------------------------------------------------------------- # Perform template substitution: #---------------------------------------------------------------------------- def expand_template ( self, template, htmldoc, dic ): for name in doc_pat.findall( template ): name = name[1:-1] dic[ name ] = getattr( self, name[ 4: ].lower(), '' ) for name in page_pat.findall( template ): name = name[1:-1] dic[ name ] = getattr( htmldoc, name[ 5: ].lower(), '' ) for name, value in dic.items(): template = template.replace( '$%s$' % name, value ) return template #---------------------------------------------------------------------------- # Generate the correct conditional path (if present): #---------------------------------------------------------------------------- def process_if ( self, name, state, links ): if_pat = re.compile( r'\$IF_%s\$(.*?)\$END\$' % name, re.DOTALL | re.IGNORECASE ) while True: match = if_pat.search( links ) if match is None: break text = match.group(1) col = text.find( '$ELSE$' ) if col < 0: cases = [ '', text ] else: cases = [ text[ col + 6: ], text[ :col ] ] links = if_pat.sub( cases[ state ], links ) return links 
#---------------------------------------------------------------------------- # Generate the Microsoft Windows Help files: #---------------------------------------------------------------------------- def gen_help ( self ): # Generate the Help Workshop 'project' file: tp = self.toc_root op = self.root self.write_file( tp + '.hhp', self.hhp_body_template % ( tp, tp, op, tp, self.title, op ) ) # Generate the Help Workshop 'table of contents' file: toc = [] for htmldoc in self.htmldocs: htmldoc.gen_help( toc, 1 ) self.write_file( self.toc_root + '.hhc', self.hhc_body_template % ( self.gen_msg, ''.join( toc ) ) ) # Generate the standard 'stop word' list used by full-text search: self.write_file( tp + '.stp', self.stop_list.replace( ',', '\n' ) ) #---------------------------------------------------------------------------- # Generate the YakBack comment log: #---------------------------------------------------------------------------- def gen_yakback_log ( self ): doc = [ '<?php\nybtitle(); ' ] for htmldoc in self.docs: doc.append( 'yblog( "%s", %d );' % ( htmldoc.title, htmldoc.page ) ) doc.append( 'ybend();\n?>' ) htmldoc = HTMLDoc( self, '\n'.join( doc ), 'YakBack Comment Log', 0, 1, False ) htmldoc.gen_href() self.gen_html( htmldoc ) #---------------------------------------------------------------------------- # Add a document as a child document: #---------------------------------------------------------------------------- def add_document ( self, htmldoc ): self.htmldocs.append( htmldoc ) #---------------------------------------------------------------------------- # Return the last document added: #---------------------------------------------------------------------------- def last_document ( self ): if len( self.htmldocs ) == 0: return self return self.htmldocs[-1] #---------------------------------------------------------------------------- # Generator for each document: #---------------------------------------------------------------------------- def 
all_documents ( self ): for htmldoc in self.htmldocs: if not htmldoc.omit: yield htmldoc for child in htmldoc.all_documents(): yield child #---------------------------------------------------------------------------- # Split the document up at <Hn> boundaries: #---------------------------------------------------------------------------- def split_doc ( self ): if self.max_heading == 0: self.max_page = 1 self.htmldocs.append( HTMLDoc( self, self.doc, self.title, 1, 1 ) ) return level = 1 stack = [ ( level, self ) ] doc = self.doc title = self.title heading = '' hn_pat = re.compile( r'<h([%s]).*?>(.*?)</h\1>' % '123456789'[: self.max_heading ], re.DOTALL | re.IGNORECASE ) while True: match = hn_pat.search( doc ) if match is None: break self.process_doc( doc[: match.start() ].strip(), title, heading, level, stack ) level = int( match.group(1) ) title = strip_tags( match.group(2) ) heading = br_pat.sub( '', match.group() ) doc = doc[ match.end(): ] if title == '': title = self.title heading = '' else: heading += '\n\n' self.process_doc( doc.strip(), title, heading, level, stack ) #---------------------------------------------------------------------------- # Process a new document page: #---------------------------------------------------------------------------- def process_doc ( self, doc, title, heading, level, stack ): if strip_tags( doc ).strip() != '': page = self.max_page + 1 omit_list = self.omit_list omit = ((len( omit_list ) > 0) and (omit_list[0] == page)) if omit: self.omit_list = [ (x - 1) for x in omit_list if x != page ] else: self.max_page = page htmldoc = HTMLDoc( self, '%s%s' % ( heading, doc ), title, page, level, omit ) while level < stack[-1][0]: del stack[-1] parent = stack[-1][1] if level > stack[-1][0]: parent = parent.last_document() stack.append( ( level, parent ) ) parent.add_document( htmldoc ) #---------------------------------------------------------------------------- # Return the page 'href' for a specified page: 
#---------------------------------------------------------------------------- def page_href ( self, page ): return self.docs[ page - 1 ].href #---------------------------------------------------------------------------- # Return the final document body to be generated (can be overridden): #---------------------------------------------------------------------------- def process_body ( self, body ): return body #---------------------------------------------------------------------------- # Process all pages at once (can be overridden): #---------------------------------------------------------------------------- def process_pages ( self ): toc = [] for htmldoc in self.docs: toc.append( '<a href="%s">%s</a>' % ( htmldoc.href, htmldoc.title ) ) self.toc = '\n<br>'.join( toc ) #---------------------------------------------------------------------------- # Write a hex encoded string as a binary file to the target directory: #---------------------------------------------------------------------------- def write_hex_file ( self, filename, hex_data ): self.write_file( os.path.join( self.base_path, filename ), binascii.unhexlify( hex_data.replace( '\r', '' ).replace( '\n', '' ) ), 'wb' ) #------------------------------------------------------------------------------- # 'HTMLDoc' class: #------------------------------------------------------------------------------- class HTMLDoc: #---------------------------------------------------------------------------- # Initialize the object: #---------------------------------------------------------------------------- def __init__ ( self, splitter, doc, title, page, level, omit ): self.splitter = splitter self.doc = doc self.title = title self.page = page self.level = level self.omit = omit self.parent = None self.htmldocs = [] #---------------------------------------------------------------------------- # Add a document as a child document: #---------------------------------------------------------------------------- def add_document 
( self, htmldoc ): self.htmldocs.append( htmldoc ) htmldoc.set_parent( self ) #---------------------------------------------------------------------------- # Set the parent HTMLDoc object for the document: #---------------------------------------------------------------------------- def set_parent ( self, parent ): self.parent = parent #---------------------------------------------------------------------------- # Return the last document added: #---------------------------------------------------------------------------- def last_document ( self ): return self.htmldocs[-1] #---------------------------------------------------------------------------- # Generator for each document: #---------------------------------------------------------------------------- def all_documents ( self ): for htmldoc in self.htmldocs: if not htmldoc.omit: yield htmldoc for child in htmldoc.all_documents(): yield child #---------------------------------------------------------------------------- # Generate the 'href' for the page: #---------------------------------------------------------------------------- def gen_href ( self ): splitter = self.splitter self.href = '%s_%d.%s' % ( splitter.base_root, self.page, splitter.file_ext ) self.fileref = os.path.join( splitter.base_path, self.href ) #---------------------------------------------------------------------------- # Parse all of the href links contained in the document: #---------------------------------------------------------------------------- def parse_hrefs ( self, hrefs ): for href in ahref_pat.findall( self.doc ): if href[:1] == '#': hrefs[ href[1:] ] = None #---------------------------------------------------------------------------- # Parse all of the href targets contained in the document: #---------------------------------------------------------------------------- def parse_targets ( self, hrefs, links ): page = self.page doc = self.doc new_doc = [] while True: match = aname_pat.search( doc ) if match is None: break 
target = match.group(1) if hrefs.has_key( target ): new_doc.append( doc[: match.end() ] ) links[ target ] = page else: new_doc.append( doc[: match.start() ] ) new_doc.append( match.group(2) ) doc = doc[ match.end(): ] new_doc.append( doc ) self.doc = ''.join( new_doc ) #---------------------------------------------------------------------------- # Patch all cross-file href links: #---------------------------------------------------------------------------- def patch_hrefs ( self, links ): page_href = self.splitter.page_href page = self.page doc = self.doc new_doc = [] while True: match = ahref_pat.search( doc ) if match is None: break href = match.group(1) if href[:1] == '#': ref_page = links.get( href[1:], None ) if ref_page!= None: if ref_page != page: new_doc.append( '%s<a href="%s%s"' % ( doc[: match.start() ], page_href( ref_page ), href ) ) else: new_doc.append( doc[: match.end() ] ) else: match2 = full_ahref_pat.search( doc ) if match2 is not None: new_doc.append( doc[: match2.start() ] + match2.group(1) ) match = match2 else: new_doc.append( doc[: match.end() ] ) doc = doc[ match.end(): ] new_doc.append( doc ) self.doc = ''.join( new_doc ) #---------------------------------------------------------------------------- # Generate the Microsoft Windows Help 'table of contents' entry for the # document: #---------------------------------------------------------------------------- def gen_help ( self, toc, indent ): entry = self.splitter.hhc_element_template % ( self.title, self.fileref ) toc.append( entry.replace( '\n', '\n' + ' ' * indent ) ) if len( self.htmldocs ) > 0: toc.append( '\n%s<UL>' % (' ' * (indent + 1)) ) for htmldoc in self.htmldocs: htmldoc.gen_help( toc, indent + 2 ) toc.append( '\n%s</UL>' % (' ' * (indent + 1)) ) #---------------------------------------------------------------------------- # Add the 'yakback' hooks into the document: #---------------------------------------------------------------------------- def gen_yakback ( self ): doc = 
self.doc new_doc = [] taglen = self.splitter.taglen index = 0 while True: match = yakback_pat.search( doc ) if match is None: break i = 1 body = match.group(1) if body is None: i = 2 body = match.group(2) if len( strip_tags( body ).strip() ) >= taglen: tail = '' body = body.strip() while body[-4:].lower() == '<br>': tail += '<br>' body = body[:-4].strip() new_doc.append( '%s<a name="yb_t%d"></a>%s<?= yb(%d,"%s") ?>%s' % ( doc[: match.start(i) ], index, body, index, doc[ match.end(i): match.end() ], tail ) ) index += 1 else: new_doc.append( doc[: match.end() ] ) doc = doc[ match.end(): ] new_doc.append( doc ) self.doc = ''.join( new_doc ) #------------------------------------------------------------------------------- # Strip all HTML tags from a specified string: #------------------------------------------------------------------------------- def strip_tags (string ): return tag_pat.sub( '', string ).replace( '\n', ' ' ) #------------------------------------------------------------------------------- # Main program: #------------------------------------------------------------------------------- if __name__ == '__main__': pydoh()