0001#!/usr/bin/env python
0002# _*_ coding: iso-8859-1 _*_
0003
0004"""This is Textile
0005A Humane Web Text Generator
0006
0007TODO:
0008* Make it work with Python 2.1.
0009* Make it work with Python 1.5.2? Or that's too optimistic?
0010
0011---
0012To get an overview of all PyTextile's features, simply
0013type 'tell me about textile.' in a single line.
0014"""
0015
0016__authors__ = ["Roberto A. F. De Almeida (roberto@dealmeida.net)",
0017 "Mark Pilgrim (f8dy@diveintomark.org)"]
0018__version__ = "2.0.10"
0019__date__ = "2004/10/06"
0020__copyright__ = """
0021Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/
0022Copyright (c) 2003, Mark Pilgrim, http://diveintomark.org/
0023All rights reserved.
0024
0025Original PHP version:
0026Version 1.0
002721 Feb, 2003
0028
0029Copyright (c) 2003, Dean Allen, www.textism.com
0030All rights reserved.
0031
0032Parts of the documentation and some of the regular expressions are (c) Brad
0033Choate, http://bradchoate.com/. Thanks, Brad!
0034"""
0035__license__ = """
0036Redistribution and use in source and binary forms, with or without
0037modification, are permitted provided that the following conditions are met:
0038
0039* Redistributions of source code must retain the above copyright notice,
0040 this list of conditions and the following disclaimer.
0041
0042* Redistributions in binary form must reproduce the above copyright notice,
0043 this list of conditions and the following disclaimer in the documentation
0044 and/or other materials provided with the distribution.
0045
0046* Neither the name Textile nor the names of its contributors may be used to
0047 endorse or promote products derived from this software without specific
0048 prior written permission.
0049
0050THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
0051AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
0052IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
0053ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
0054LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
0055CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
0056SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
0057INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
0058CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
0059ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
0060POSSIBILITY OF SUCH DAMAGE.
0061"""
0062__history__ = """
00631.0 - 2003/03/19 - MAP - initial release
00641.01 - 2003/03/19 - MAP - don't strip whitespace within <pre> tags;
0065 map high-bit ASCII to HTML numeric entities
00661.02 - 2003/03/19 - MAP - changed hyperlink qtag expression to only
0067 match valid URL characters (per RFC 2396); fixed preg_replace to
0068 not match across line breaks (solves lots of problems with
0069 mistakenly matching overlapping inline markup); fixed whitespace
0070 stripping to only strip whitespace from beginning and end of lines,
0071 not immediately before and after HTML tags.
00721.03 - 2003/03/20 - MAP - changed hyperlink qtag again to more
0073 closely match original Textile (fixes problems with links
0074 immediately followed by punctuation -- somewhere Dean is
0075 grinning right now); handle curly apostrophe with "ve"
0076 contraction; clean up empty titles at end.
00771.04 - 2003/03/23 - MAP - lstrip input to deal with extra spaces at
0078 beginning of first line; tweaked list loop to handle consecutive lists
00791.1 - 2003/06/06 - MAP - created initial test suite for links and images,
0080 and fixed a bunch of related bugs to pass them
00811.11 - 2003/07/20 - CL - don't demoronise unicode strings; handle
0082 "they're" properly
00831.12 - 2003/07/23 - GW - print debug messages to stderr; handle bq(cite).
00841.13 - 2003/07/23 - MAP - wrap bq. text in <p>...</p>
00852 - 2004/03/26 - RAFA - rewritten from (almost) scratch to include
0086 all features from Textile 2 and a little bit more.
00872.0.1 - 2004/04/02 - RAFA - Fixed validating function that uses uTidyLib.
00882.0.2 - 2004/04/02 - RAFA - Fixed problem with caps letters in URLs.
00892.0.3 - 2004/04/19 - RAFA - Multiple classes are allowed, thanks to Dave
0090 Anderson. The "lang" attribute is now removed from <code>, to be valid
0091 XHTML. Fixed <span class="caps">UCAS</span> problem.
00922.0.4 - 2004/05/20 - RAFA, CLB - Added inline formatting to table cells.
0093 Curt Bergmann fixed a bug with the colspan formatting. Added Amazon
0094 Associated id.
00952.0.5 - 2004/06/01 - CL - Applied patch from Chris Lawrence to (1) fix
0096 that Amazon associates ID was being added to all search URIs, (2)
0097 customize the Amazon site used with the AMAZON variable, and (3) added
0098 an "isbn" URI type that links directly to an Amazon product by ISBN or
0099 Amazon ASIN.
01002.0.6 - 2004/06/02 - RAFA - Fixed CAPS problem, again. I hope this is
0101 the last time.
01022.0.7 - 2004/06/04 - RAFA, MW - Fixed bullet macro, thanks to Adam
0103 Messinger. Added patch from Michal Wallace changing {}.pop() for
0104 compatibility with Python 2.2.x.
01052.0.8 - 2004/06/25 - RAFA - Strip tags when adding the content from a
0106 footnote to the reference link. Escaped '<' and '>' in the self-
0107 generated documentation.
01082.0.9 - 2004/10/04 - RAFA - In images, if ALT is not defined, add an
0109 empty attribute. Added "LaTeX" style open/close quotes. Fixed a bug
0110 where the acronym definition was being formatted with inline rules.
0111 Handle "broken" lines correctly, removing the <br /> from inside
0112 split HTML tags.
01132.0.10 - 2004/10/06 - RAFA, LO - Escape all non-escaped ampersands.
0114 Applied "trivial patch" from Ludvig Omholt to remove newline right
0115 after the <pre> tag.
0116"""
0117
# Module-wide configuration. Edit the values below to change
# PyTextile's defaults.

# Set your encoding here.
ENCODING = 'latin-1'

# Output encoding. Non-ASCII characters will be automatically
# converted to XML entities if you choose ASCII.
OUTPUT = 'ascii'

# PyTextile can optionally validate the generated
# XHTML code. We can use either mxTidy or uTidyLib.
# You can change the default behaviour here.
VALIDATE = 0

# If you want h1. to be translated to something other
# than <h1>, change this offset. You can also pass it
# as an argument to textile().
HEAD_OFFSET = 0

# If you want to use itex2mml, specify the full path
# to the binary here. You can download it from here:
# http://golem.ph.utexas.edu/~distler/blog/files/itexToMML.tar.gz
# The commented-out assignments are example paths.
itex2mml = None
#itex2mml = '/usr/local/bin/itex2MML'
#itex2mml = '/usr/people/almeida/bin/itex2MML'

# PyTextile can optionally sanitize the generated XHTML,
# which is good for weblog comments or if you don't trust
# yourself.
SANITIZE = 0

# Debug verbosity. _debug() prints a message only when
# DEBUGLEVEL is greater than or equal to the message's level
# (1 unless the caller says otherwise), so 0 keeps those quiet.
DEBUGLEVEL = 0

# Amazon associate for links: "keywords":amazon
# If you don't have one, please consider leaving mine here as
# a small compensation for writing PyTextile. It is commented
# out by default.
#amazon_associate_id = 'bomtempo-21'
amazon_associate_id = None

# Amazon site used for the links; the commented-out line is an
# alternative value.
#AMAZON = 'www.amazon.co.uk'
AMAZON = 'www.amazon.com'
0159
0160import re
0161import sys
0162import os
0163import sgmllib
0164import unicodedata
0165
0166
def _in_tag(text, tag):
    """Returns the text found inside a tag.

    Everything up to and including the first opening `<tag ...>` is
    dropped, as is everything from the first following `</tag` on.
    When either marker is missing, that side of the text is left
    alone. The result is stripped of surrounding whitespace and its
    '\\r\\n' line endings are turned into '\\n'.

    It's useful to get the text between <body></body> or
    <pre></pre> when using the validators or the colorizer.
    """
    opening = '<%s' % tag
    closing = '</%s' % tag

    start = text.find(opening)
    if start != -1:
        # Drop everything up to the tag name...
        text = text[start + len(opening):]

        # ...and then the rest of the opening tag (attributes and '>').
        tag_end = text.find('>')
        if tag_end != -1:
            text = text[tag_end + 1:]

    stop = text.find(closing)
    if stop != -1:
        text = text[:stop]

    return text.strip().replace('\r\n', '\n')
0184
0185
0186# If you want PyTextile to automatically colorize
0187# your Python code, you need the htmlizer module
0188# from Twisted. (You can just grab this file from
0189# the distribution, it has no other dependencies.)
try:
    # The same module is also available as twisted.python.htmlizer.
    import htmlizer
    from StringIO import StringIO

    def _color(code):
        """Colorizes Python code.

        The code is wrapped in a StringIO and run through the
        htmlizer filter from Twisted. Only the markup found
        inside the generated <pre></pre> is returned.
        """
        # Fix line continuations: ' \' at the end of a line
        # gets a second backslash before being colorized.
        escaped = preg_replace(r' \\\n', ' \\\\\n', code)

        source = StringIO(escaped)
        rendered = StringIO()
        htmlizer.filter(source, rendered)

        # Remove the surrounding <pre></pre> from the output.
        markup = _in_tag(rendered.getvalue(), 'pre')

        # Fix newlines: move each one out of its marker span.
        return markup.replace('<span class="py-src-newline">\n</span>', '<span class="py-src-newline"></span>\n')

except ImportError:
    # No colorizer available; `htmlizer` is left as None.
    htmlizer = None
0220
0221
0222# PyTextile can optionally validate the generated
0223# XHTML code using either mxTidy or uTidyLib.
try:
    # First choice: mxTidy.
    from mx.Tidy import Tidy

    def _tidy1(text):
        """XHTML validation through mxTidy.

        Runs the text through Tidy.tidy() and returns what
        is found inside the <body> of the tidied document.
        """
        result = Tidy.tidy(text, output_xhtml=1, numeric_entities=1, wrap=0)
        nerrors, nwarnings, cleaned, errortext = result
        return _in_tag(cleaned, 'body')

    _tidy = _tidy1

except ImportError:
    try:
        # Second choice: uTidyLib.
        import tidy

        def _tidy2(text):
            """XHTML validation through uTidyLib.

            Runs the text through tidy.parseString() and returns
            what is found inside the <body> of the tidied document.
            """
            document = tidy.parseString(text, output_xhtml=1, add_xml_decl=0, indent=0, tidy_mark=0)
            return _in_tag(str(document), 'body')

        _tidy = _tidy2

    except ImportError:
        # Neither validator could be imported.
        _tidy = None
0255
0256
0257# This is good for debugging.
def _debug(s, level=1):
    """Prints debug information to sys.stderr.

    The message `s` is written only when the module-level
    DEBUGLEVEL is greater than or equal to `level`; otherwise
    the call does nothing.
    """
    if DEBUGLEVEL < level:
        return

    print >> sys.stderr, s
0265
0266
#############################
# Useful regular expressions.
#
# `parameters` holds reusable sub-patterns for the attribute
# modifiers. They are not used on their own here: the templates
# in `res` pull them in by name with %(align)s, %(classid)s, etc.
#
# The fragments carry inline '#' comments and layout whitespace,
# so the final patterns presumably have to be compiled in verbose
# mode (re.VERBOSE) -- the compiling code is not in this section.
#
# Most fragments end with a negative look-ahead of the form
# (?![^\s]*(?:...)): it rejects the match when another modifier
# of the same kind follows before the next whitespace.
parameters = {
 # Horizontal alignment.
 'align': r'''(?:(?:<>|[<>=]) # Either '<>', '<', '>' or '='
 (?![^\s]*(?:<>|[<>=]))) # Look-ahead to ensure it happens once
 ''',

 # Horizontal padding.
 'padding': r'''(?:[\(\)]+) # Any number of '(' and/or ')'
 ''',

 # Class and/or id.
 # This is the only fragment with a capturing group of its own.
 'classid': r'''( #
 (?:\(\#[\w]+\)) # (#id)
 | #
 (?:\((?:[\w]+(?:\s[\w]+)*) #
 (?:\#[\w]+)?\)) # (class1 class2 ... classn#id) or (class1 class2 ... classn)
 ) #
 (?![^\s]*(?:\([\w#]+\))) # must happen once
 ''',

 # Language.
 'lang': r'''(?:\[[\w-]+\]) # [lang]
 (?![^\s]*(?:\[.*?\])) # must happen once
 ''',

 # Style.
 'style': r'''(?:{[^\}]+}) # {style}
 (?![^\s]*(?:{.*?})) # must happen once
 ''',
}
0299
# `res` maps a short name to a regular expression source string.
#
# The entries followed by `% parameters` are templates: their
# %(align)s, %(classid)s, %(padding)s, %(lang)s and %(style)s
# placeholders are filled in from the `parameters` dictionary when
# this module is loaded. 'punct', 'url' and 'resize' are plain
# strings, so the '%' characters they contain are literal.
#
# Like the fragments in `parameters`, the multi-line entries carry
# inline '#' comments, so they presumably need re.VERBOSE when
# compiled -- the compiling code is not in this section.
res = {
 # Punctuation.
 # A single character class listing the punctuation characters.
 'punct': r'''[\!"#\$%&'()\*\+,\-\./:;<=>\?@\[\\\]\^_`{\|}\~]''',

 # URL regular expression.
 # The leading part (protocol and host, e-mail style address, or
 # bare domain with a known TLD) is optional as a whole, and so
 # are the port and the rest of the URL.
 'url': r'''(?=[a-zA-Z0-9./#]) # Must start correctly
 (?: # Match the leading part (proto://hostname, or just hostname)
 (?:ftp|https?|telnet|nntp) # protocol
 :// # ://
 (?: # Optional 'username:password@'
 \w+ # username
 (?::\w+)? # optional :password
 @ # @
 )? #
 [-\w]+(?:\.\w[-\w]*)+ # hostname (sub.example.com)
 | #
 (?:mailto:)? # Optional mailto:
 [-\+\w]+ # username
 \@ # at
 [-\w]+(?:\.\w[-\w]*)+ # hostname
 | #
 (?:[a-z0-9](?:[-a-z0-9]*[a-z0-9])?\.)+ # domain without protocol
 (?:com\b # TLD
 | edu\b #
 | biz\b #
 | gov\b #
 | in(?:t|fo)\b # .int or .info
 | mil\b #
 | net\b #
 | org\b #
 | museum\b #
 | aero\b #
 | coop\b #
 | name\b #
 | pro\b #
 | [a-z][a-z]\b # two-letter country codes
 ) #
 )? #
 (?::\d+)? # Optional port number
 (?: # Rest of the URL, optional
 /? # Start with '/'
 [^.!,?;:"'<>()\[\]{}\s\x7F-\xFF]* # Can't start with these
 (?: #
 [.!,?;:]+ # One or more of these
 [^.!,?;:"'<>()\[\]{}\s\x7F-\xFF]+ # Can't finish with these
 #'" # # or ' or "
 )* #
 )? #
 ''',


 # Block attributes.
 # Any mix of the five modifiers, captured as the named group
 # 'parameters'; the whole group is optional.
 'battr': r'''(?P<parameters> #
 (?: %(align)s # alignment
 | %(classid)s # class and/or id
 | %(padding)s # padding tags
 | %(lang)s # [lang]
 | %(style)s # {style}
 )+ #
 )? #
 ''' % parameters,

 # (Un)ordered list attributes.
 # Same alternatives as 'battr', captured as 'olparameters'.
 # The class/id alternative is spelled out here instead of using
 # %(classid)s, and so lacks that fragment's "must happen once"
 # look-ahead.
 'olattr': r'''(?P<olparameters> #
 (?: %(align)s # alignment
 | ((?:\(\#[\w]+\)) # (#id)
 | #
 (?:\((?:[\w]+(?:\s[\w]+)*) #
 (?:\#[\w]+)?\)) # (class1 class2 ... classn#id) or (class1 class2 ... classn)
 ) #
 | %(padding)s # padding tags
 | %(lang)s # [lang]
 | %(style)s # {style}
 )+ #
 )? #
 ''' % parameters,

 # List item attributes.
 # Same alternatives as 'battr', captured as 'liparameters'.
 'liattr': r'''(?P<liparameters> #
 (?: %(align)s # alignment
 | %(classid)s # class and/or id
 | %(padding)s # padding tags
 | %(lang)s # [lang]
 | %(style)s # {style}
 )+ #
 )? #
 ''' % parameters,

 # Qtag attributes.
 # Only class/id, language and style are accepted here.
 'qattr': r'''(?P<parameters> #
 (?: %(classid)s # class and/or id
 | %(lang)s # [lang]
 | %(style)s # {style}
 )+ #
 )? #
 ''' % parameters,

 # Link attributes.
 # Like 'battr' but without the padding alternative.
 'lattr': r'''(?P<parameters> # Links attributes
 (?: %(align)s # alignment
 | %(classid)s # class and/or id
 | %(lang)s # [lang]
 | %(style)s # {style}
 )+ #
 )? #
 ''' % parameters,

 # Image attributes.
 # Uses its own horizontal ('<', '>') and vertical ('-', '^', '~')
 # alignment alternatives instead of %(align)s, and has no [lang].
 'iattr': r'''(?P<parameters> #
 (?: #
 (?: [<>]+ # horizontal alignment tags
 (?![^\s]*(?:[<>]))) # (must happen once)
 | #
 (?: [\-\^~]+ # vertical alignment tags
 (?![^\s]*(?:[\-\^~]))) # (must happen once)
 | %(classid)s # class and/or id
 | %(padding)s # padding tags
 | %(style)s # {style}
 )+ #
 )? #
 ''' % parameters,

 # Resize attributes.
 # Three alternative spellings, each with two capturing groups
 # (six groups in total), and the whole expression is optional.
 # NOTE: the example labels inside the pattern look swapped: the
 # line labelled "10h 20w" matches "<n>w <n>h" and the line
 # labelled "20w 10h" matches "<n>h <n>w". They are part of the
 # string, so they are left as they are.
 'resize': r'''(?: #
 (?:([\d]+%?)x([\d]+%?)) # 20x10
 | #
 (?: # or
 (?:([\d]+)%?w\s([\d]+)%?h) # 10h 20w
 | # or
 (?:([\d]+)%?h\s([\d]+)%?w) # 20w 10h
 ) #
 )? #
 ''',

 # Table attributes.
 # The block modifiers plus the table-only ones: vertical
 # alignment ('^' or '~'), '_' for a header, a backslash followed
 # by digits for colspan and '/' followed by digits for rowspan.
 'tattr': r'''(?P<parameters> #
 (?: #
 (?: [\^~] # vertical alignment
 (?![^\s]*(?:[\^~]))) # (must happen once)
 | %(align)s # alignment
 | %(lang)s # [lang]
 | %(style)s # {style}
 | %(classid)s # class and/or id
 | %(padding)s # padding
 | _ # is this a header row/cell?
 | \\\d+ # colspan
 | /\d+ # rowspan
 )+ #
 )? #
 ''' % parameters,
}
0451
0452
def preg_replace(pattern, replacement, text):
    """Alternative re.sub that handles empty groups.

    This acts like re.sub, except that a group which is empty or
    did not take part in the match is replaced with '' instead of
    raising an exception.

    `pattern` is the regular expression source, `replacement` is a
    template in which \\1, \\2, ... stand for the groups of the
    match, and `text` is the string to work on. The new string is
    returned.

    The template is expanded in a single pass:

    * the longest run of digits naming an existing group is used,
      so \\10 means group 10 when the pattern has that many groups,
      and group 1 followed by a literal '0' otherwise;
    * a reference to a group the pattern does not have (and \\0)
      is left in the output as it is;
    * text inserted for a group is never looked at again, so a
      backslash followed by digits inside the matched text stays
      untouched.
    """
    # A backslash followed by a number that doesn't start with zero.
    group_ref = re.compile(r'\\([1-9][0-9]*)')

    def replacement_func(matchobj):
        groups = matchobj.groups()
        _debug(groups)

        def expand(ref):
            digits = ref.group(1)

            # Try the whole number first, then shorter prefixes; the
            # digits that are left over are literal text.
            for end in range(len(digits), 0, -1):
                index = int(digits[:end])
                if index <= len(groups):
                    return (groups[index - 1] or '') + digits[end:]

            # No such group: keep the reference as it was written.
            return ref.group(0)

        # A function is used as the replacement here, so nothing in
        # the group text is treated as an escape sequence.
        return group_ref.sub(expand, replacement)

    p = re.compile(pattern)
    _debug(pattern)

    return p.sub(replacement_func, text)
0477
0478
0479def html_replace(pattern, replacement, text):
0480 """Replacement outside HTML tags.
0481
0482 Does a preg_replace only outside HTML tags.
0483 """
0484 # If there is no html, do a simple search and replace.
0485 if not re.search(r'''<.*>''', text):
0486 return preg_replace(pattern, replacement, text)
0487
0488 else:
0489 lines = []
0490 # Else split the text into an array at <>.