Mercurial > enso_core

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/enso/utils/xml_tools.py	Fri Feb 22 14:57:07 2008 -0600
@@ -0,0 +1,259 @@
+# Copyright (c) 2008, Humanized, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#    1. Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#
+#    2. Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#
+#    3. Neither the name of Enso nor the names of its contributors may
+#       be used to endorse or promote products derived from this
+#       software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY Humanized, Inc. ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL Humanized, Inc. BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# ----------------------------------------------------------------------------
+#
+#   enso.utils.xml_tools
+#
+# ----------------------------------------------------------------------------
+
+"""
+    XML utility functions.  This module is called "xml_tools" instead
+    of simply "xml" because that would cause a namespace conflict due
+    to Python 2.x's default prioritization of relative imports over
+    absolute imports (this will change in Py3k, though).
+"""
+
+# ----------------------------------------------------------------------------
+# Imports
+# ----------------------------------------------------------------------------
+
+import cStringIO
+import xml.sax.handler
+
+
+# ----------------------------------------------------------------------------
+# Public Constants
+# ----------------------------------------------------------------------------
+
+# Invalid control character ordinals that can't be included in
+# well-formed XML text.  These were determined by examining the ASCII
+# characters with the BT_NONXML type in the asciitab.h file of the
+# expat library.
+INVALID_CONTROL_CHARACTERS = (
+    # The ordinals below 0x20, assembled from contiguous runs.  The
+    # gaps are 0x09 (tab), 0x0a (line feed) and 0x0d (carriage
+    # return), which are not in the list.
+    list( range( 0x00, 0x09 ) ) +      # 0x00 through 0x08
+    [ 0x0b, 0x0c ] +
+    list( range( 0x0e, 0x20 ) )        # 0x0e through 0x1f
+)
+
+
+# ----------------------------------------------------------------------------
+# Private Constants
+# ----------------------------------------------------------------------------
+
+# Translation table for unicode.translate(): every invalid control
+# character ordinal is mapped to None, which deletes that character.
+_UNICODE_INVALID_CONTROL_CHARACTERS_TRANSLATION_TABLE = dict.fromkeys(
+    INVALID_CONTROL_CHARACTERS
+    )
+
+# Identity transformation string for the str.translate() method: the
+# 256 characters chr(0) through chr(255), in order.
+_STRING_IDENTITY_TRANSLATION = "".join( map( chr, range( 256 ) ) )
+
+# Deletechars string for the str.translate() method, used to remove
+# invalid control characters.  No loop temporaries are created while
+# building these constants, so nothing has to be deleted afterwards.
+_STRING_INVALID_CONTROL_CHARACTERS_DELETECHARS = "".join(
+    map( chr, INVALID_CONTROL_CHARACTERS )
+    )
+
+
+# ----------------------------------------------------------------------------
+# DOM Node functions
+# ----------------------------------------------------------------------------
+
+def getInnerText( domNode ):
+    """
+    Collects the text contained in domNode and returns it as a single
+    string with leading and trailing whitespace stripped.
+
+    Text and CDATA children contribute their data directly; every
+    other kind of child contributes its own inner text, obtained by
+    calling this function recursively, so text from descendant nodes
+    at any depth is included.  Note that the whitespace stripping is
+    therefore applied to the result of each recursive call as well.
+    """
+
+    def textOf( node ):
+        # Text-bearing nodes are taken verbatim; anything else is
+        # flattened recursively.
+        if node.nodeType in ( domNode.TEXT_NODE,
+                              domNode.CDATA_SECTION_NODE ):
+            return node.data
+        return getInnerText( node )
+
+    pieces = [ textOf( child ) for child in domNode.childNodes ]
+    return "".join( pieces ).strip()
+
+
+def removeInvalidControlCharacters( string ):
+    """
+    Strips the invalid control characters out of the given text and
+    returns the result.
+
+    Both standard Python strings (str) and unicode objects are
+    accepted, and the result has the same type as the argument.  Any
+    other kind of argument causes an AssertionError to be raised.
+    """
+
+    if isinstance( string, str ):
+        # str.translate() takes an identity table plus the characters
+        # to delete.
+        return string.translate(
+            _STRING_IDENTITY_TRANSLATION,
+            _STRING_INVALID_CONTROL_CHARACTERS_DELETECHARS
+            )
+
+    if isinstance( string, unicode ):
+        # unicode.translate() deletes every ordinal mapped to None.
+        return string.translate(
+            _UNICODE_INVALID_CONTROL_CHARACTERS_TRANSLATION_TABLE
+            )
+
+    raise AssertionError( "string must be a string or unicode object." )
+
+
+def escapeXml( xmlData ):
+    """
+    Escapes the XML markup characters in xmlData and strips out any
+    invalid control characters.
+
+    Each "&" becomes "&amp;", each "<" becomes "&lt;" and each ">"
+    becomes "&gt;".  The escaping is applied exactly once, so text
+    that is already escaped gets escaped again (e.g., "&amp;" ->
+    "&amp;amp;").  Returns the resulting string.
+    """
+
+    # The ampersand has to be handled first; otherwise the ampersands
+    # introduced by the other replacements would be escaped again.
+    # Escaping ">" is what takes care of the sequence "]]>".
+    replacements = (
+        ( "&", "&amp;" ),
+        ( "<", "&lt;" ),
+        ( ">", "&gt;" ),
+        )
+    for rawText, escapedText in replacements:
+        xmlData = xmlData.replace( rawText, escapedText )
+
+    return removeInvalidControlCharacters( xmlData )
+
+
+# ----------------------------------------------------------------------------
+# Xml Identity Sax Handler Class
+# ----------------------------------------------------------------------------
+
+class XmlIdentityHandler( xml.sax.handler.ContentHandler ):
+    """
+    SAX content handler that writes the events it receives back out
+    as XML text.  Start tags (with their attributes), end tags,
+    character data and processing instructions are each serialized
+    to the stream held in the "output" attribute.
+    """
+
+    def __init__( self, outFile = None ):
+        """
+        outFile is the file-like object to write to; it must provide
+        a write() method.  If it is omitted (None), a new
+        cStringIO.StringIO buffer is created and used instead.
+        """
+
+        xml.sax.handler.ContentHandler.__init__( self )
+        # Compare against None explicitly so that a caller-supplied
+        # stream that happens to evaluate as false is still used.
+        if outFile is not None:
+            self.output = outFile
+        else:
+            self.output = cStringIO.StringIO()
+
+    def _escapeAttributeValue( self, value ):
+        """
+        Returns value in a form that is safe to place between double
+        quotes in a start tag.
+        """
+
+        # Non-string values used to be accepted through "%s"
+        # formatting, so keep accepting them by converting first.
+        if not isinstance( value, basestring ):
+            value = str( value )
+        # escapeXml() deals with "&", "<", ">" and invalid control
+        # characters; the quote delimiter is handled afterwards so
+        # that the "&" of "&quot;" is not escaped a second time.
+        return escapeXml( value ).replace( "\"", "&quot;" )
+
+    def writeStartTag( self, tag, attrs = None ):
+        """
+        Writes the opening tag for the element named tag.  attrs, if
+        given and non-empty, maps attribute names to values; each
+        value is escaped and written inside double quotes.
+        """
+
+        self.output.write( "<" )
+        self.output.write( tag )
+        if attrs:
+            for key in attrs.keys():
+                # A SAX parser reports attribute values with entity
+                # references already expanded, so they have to be
+                # escaped again to keep the output well-formed.
+                value = self._escapeAttributeValue( attrs[key] )
+                self.output.write( " %s=\"%s\"" % (key, value) )
+        self.output.write( ">" )
+
+    def writeEndTag( self, tag ):
+        """
+        Writes the closing tag for the element named tag.
+        """
+
+        self.output.write( "</" )
+        self.output.write( tag )
+        self.output.write( ">" )
+
+    def characters( self, chars ):
+        """
+        SAX event: writes character data, escaped with escapeXml().
+        """
+
+        self.output.write( escapeXml( chars ) )
+
+    def startElement( self, tag, attrs ):
+        """
+        SAX event: writes the start tag via writeStartTag().
+        """
+
+        self.writeStartTag( tag, attrs )
+
+    def endElement( self, tag ):
+        """
+        SAX event: writes the end tag via writeEndTag().
+        """
+
+        self.writeEndTag( tag )
+
+    def processingInstruction( self, target, data ):
+        """
+        SAX event: writes the processing instruction in the form
+        "<?target data?>".
+        """
+
+        self.output.write( "<?" )
+        self.output.write( target )
+        self.output.write( " " )
+        self.output.write( data )
+        # A processing instruction is terminated by "?>"; closing it
+        # with a bare ">" would leave the output malformed.
+        self.output.write( "?>" )
+
+
+def runTransform( handler, file, parser ):
+    """
+    Parses file with parser, using handler as the parser's content
+    handler, and returns the handler's output stream.
+
+    handler must expose its output stream as an "output" attribute
+    that supports seek(); the stream is rewound to its beginning
+    before it is returned.
+    """
+
+    parser.setContentHandler( handler )
+    parser.parse( file )
+
+    # Rewind so that the caller reads the output from the start.
+    result = handler.output
+    result.seek( 0 )
+    return result
+
+
+# ----------------------------------------------------------------------------
+# "Directory" Entity Resolver
+# ----------------------------------------------------------------------------
+
+class DirResolver( xml.sax.handler.EntityResolver ):
+    """
+    Entity resolver that looks for entities inside a given directory
+    before falling back to the system identifier as it was given.
+    """
+
+    def __init__( self, dir ):
+        # Directory in which system identifiers are looked up.
+        self.dir = dir
+
+    def resolveEntity( self, publicId, systemId ):
+        """
+        Returns the path of systemId joined onto the resolver's
+        directory if something exists at that path, and systemId
+        itself otherwise.  publicId is not used.
+        """
+
+        import os.path
+
+        # Referencing the argument stops pychecker from complaining
+        # about it being unused.
+        dummy = publicId
+
+        candidate = os.path.join( self.dir, systemId )
+        if not os.path.exists( candidate ):
+            return systemId
+        return candidate
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test_xml_tools.py	Fri Feb 22 14:57:07 2008 -0600
@@ -0,0 +1,124 @@
+"""
+    Test cases for enso.utils.xml_tools.
+"""
+
+# ----------------------------------------------------------------------------
+# Imports
+# ----------------------------------------------------------------------------
+
+import unittest
+import xml.dom.minidom
+import xml.sax
+
+from enso.utils import xml_tools
+
+
+# ----------------------------------------------------------------------------
+# Unit Tests
+# ----------------------------------------------------------------------------
+
+class InnerTextTests( unittest.TestCase ):
+    """
+    Test cases for enso.utils.xml_tools: removal of invalid control
+    characters, XML escaping, and inner-text extraction from DOM
+    nodes.
+    """
+
+    # Documents whose inner text is exactly the lowercase alphabet,
+    # split in different ways across text, CDATA sections and nested
+    # elements.
+    TESTS = []
+    TESTS.append( """<xml><![CDATA[abcdefg]]>hij<A>kl<B>mn</B>opqrs<C>t</C><![CDATA[uvw]]>x</A>yz</xml>""" )
+    TESTS.append( """<xml><![CDATA[abcdefghijklmnopqrstuvwxyz]]></xml>""" )
+    TESTS.append( """<xml>a<![CDATA[b]]>cdefghijklmnopqrstuvwxyz</xml>""" )
+
+    def testInvalidControlCharactersRaiseExceptions( self ):
+        # Sanity check: the parser really does reject every character
+        # listed in INVALID_CONTROL_CHARACTERS.
+        handler = xml.sax.handler.ContentHandler()
+        for char in xml_tools.INVALID_CONTROL_CHARACTERS:
+            xmlStr = "<test>%s</test>" % chr( char )
+            self.assertRaises(
+                xml.sax.SAXParseException,
+                xml.sax.parseString,
+                xmlStr,
+                handler
+                )
+
+    def testRemoveInvalidControlCharactersWorksOnNonUnicodeStrings( self ):
+        handler = xml.sax.handler.ContentHandler()
+        for char in xml_tools.INVALID_CONTROL_CHARACTERS:
+            xmlStr = "<test>%s</test>" % chr( char )
+            xmlStr = xml_tools.removeInvalidControlCharacters( xmlStr )
+            self.assertEqual( xmlStr, "<test></test>" )
+            # The cleaned string must now parse without errors.
+            xml.sax.parseString( xmlStr, handler )
+
+    def testRemoveInvalidControlCharactersWorksOnUnicodeStrings( self ):
+        handler = xml.sax.handler.ContentHandler()
+        for char in xml_tools.INVALID_CONTROL_CHARACTERS:
+            xmlStr = u"<test>%s\u2026</test>" % chr( char )
+            xmlStr = xml_tools.removeInvalidControlCharacters( xmlStr )
+            self.assertEqual( xmlStr, u"<test>\u2026</test>" )
+            # The cleaned string must now parse without errors.
+            xml.sax.parseString( xmlStr, handler )
+
+    def testEscapeXmlRemovesInvalidControlCharacters( self ):
+        # Every invalid character is checked, for both unicode and
+        # non-unicode input.  (A stray "return" inside this loop used
+        # to end the test after the first character.)
+        for char in xml_tools.INVALID_CONTROL_CHARACTERS:
+            xmlStr = u"%s\u2026" % chr( char )
+            self.assertEqual( xml_tools.escapeXml(xmlStr),
+                              u"\u2026" )
+            xmlStr = "%shi" % chr( char )
+            self.assertEqual( xml_tools.escapeXml(xmlStr),
+                              "hi" )
+
+    def testWierdCases( self ):
+        innerText = "abcdefghijklmnopqrstuvwxyz"
+
+        # Parse and check inside the loop so that every document in
+        # TESTS is verified, not just the last one.
+        for xmlData in self.TESTS:
+            document = xml.dom.minidom.parseString( xmlData )
+
+            results = xml_tools.getInnerText( document )
+            self.assertEqual( results, innerText )
+
+    def testManyTags( self ):
+
+        # Note: the range stops before "z", so the tags used are "a"
+        # through "y".
+        cases = range( ord("a"), ord("z") )
+        cases = [ chr( c ) for c in cases ]
+
+        # One sibling element per letter, each containing that letter
+        # uppercased.
+        xmlData = "<xml>\n"
+        for c in cases:
+            xmlData += ( "  <%s>%s</%s>\n" % ( c, c.upper(), c ) )
+        xmlData += "</xml>"
+
+        document = xml.dom.minidom.parseString( xmlData )
+
+        for c in cases:
+            nodes = document.getElementsByTagName( c )
+            # There should be only one dom node for each tag name.
+            self.assertEqual( len(nodes), 1 )
+            # The inner text should be exactly the tag name, uppercased.
+            results = xml_tools.getInnerText( nodes[0] )
+            self.assertEqual( results, c.upper() )
+
+    def testMuchRecursion( self ):
+        # Note: the range stops before "z", so the tags used are "a"
+        # through "y".
+        cases = range( ord("a"), ord("z") )
+        cases = [ chr( c ) for c in cases ]
+
+        # Build one chain of nested elements: each element holds its
+        # own uppercased tag name followed by the next element.  The
+        # "%s" placeholder marks where the next level is inserted.
+        xmlData = "<xml>%s</xml>"
+        innerText = ""
+        for c in cases[:-1]:
+            xmlData %= ( "<%s>%s</%s>" % ( c, c.upper()+"%s", c ) )
+            innerText += c.upper()
+        # The innermost element has no placeholder left in it.
+        c = cases[-1]
+        xmlData %= ( "<%s>%s</%s>" % ( c, c.upper(), c ) )
+        innerText += c.upper()
+
+        document = xml.dom.minidom.parseString( xmlData )
+
+        # There should be only one dom node for the "xml" tag name.
+        nodes = document.getElementsByTagName( "xml" )
+        self.assertEqual( len(nodes), 1 )
+
+        # The inner text should be the uppercased tag names of all the
+        # nested elements, concatenated from outermost to innermost.
+        results = xml_tools.getInnerText( nodes[0] )
+        self.assertEqual( results, innerText )
+
+
+# ----------------------------------------------------------------------------
+# Script
+# ----------------------------------------------------------------------------
+
+# Run the test cases in this module through unittest's command-line
+# runner when the file is executed directly as a script.
+if __name__ == "__main__":
+    unittest.main()