changeset 11:bfca1c0eccea

Added enso.utils.strings and unit tests for it.
author Atul Varma <varmaa@toolness.com>
date Fri, 22 Feb 2008 14:27:22 -0600
parents 01bd04cb9ba8
children 40f72f6cd6eb
files README enso/utils/__init__.py enso/utils/strings.py tests/test_strings.py
diffstat 3 files changed, 257 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/README	Fri Feb 22 07:30:36 2008 -0600
+++ b/README	Fri Feb 22 14:27:22 2008 -0600
@@ -0,0 +1,6 @@
+Enso Readme
+===========
+
+For the time being, in order to use Enso, you need to add the root
+directory of the Enso source tree to your PYTHONPATH environment
+variable.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/enso/utils/strings.py	Fri Feb 22 14:27:22 2008 -0600
@@ -0,0 +1,169 @@
+# Copyright (c) 2008, Humanized, Inc.
+# All rights reserved.
+# 
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#    1. Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#
+#    2. Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#
+#    3. Neither the name of Enso nor the names of its contributors may
+#       be used to endorse or promote products derived from this
+#       software without specific prior written permission.
+# 
+# THIS SOFTWARE IS PROVIDED BY Humanized, Inc. ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL Humanized, Inc. BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# ----------------------------------------------------------------------------
+#
+#   enso.utils.strings
+#
+# ----------------------------------------------------------------------------
+
+"""
+    Various string utility methods.
+"""
+
+# ----------------------------------------------------------------------------
+# Constants
+# ----------------------------------------------------------------------------
+
+# Double "smart quotes": U+201C (opening) and U+201D (closing).
+OPEN_QUOTE = u"\u201C"
+CLOSE_QUOTE = u"\u201D"
+
+# Single "smart quotes": U+2018 (opening) and U+2019 (closing).
+OPEN_SINGLE_QUOTE = u"\u2018"
+CLOSE_SINGLE_QUOTE = u"\u2019"
+
+
+# ----------------------------------------------------------------------------
+# String utility functions
+# ----------------------------------------------------------------------------
+
+def smartQuote( text ):
+    """
+    Converts the straight double and single quotes in text into
+    typographic "smart quotes" and returns the result as a unicode
+    object.
+
+    NOTE: Quotes are paired purely by order of appearance, so it is
+    best to apply this function to a formatting string, e.g.:
+        ' %s ' - output from blah command
+    and only afterwards perform the formatting operation that
+    inserts arbitrary, unknown text.
+    """
+
+    # Double quotes are converted first; single quotes (with their
+    # contraction handling) are converted second.
+    return _smartSingleQuote( _smartDoubleQuote( text ) )
+
+
+def _smartSingleQuote( inText ):
+    """
+    Converts straight single quotes into opening and closing
+    "smart single quotes", leaving alone the apostrophes that
+    belong to a fixed list of known contractions.
+
+    Returns the converted text as a unicode object.
+    """
+
+    # Work on a unicode copy of the input.
+    result = unicode( inText[:] )
+
+    # Substrings whose apostrophe must survive untouched, each
+    # paired with an apostrophe-free placeholder that stands in
+    # for it while the quotation marks are being converted.
+    contractions = [ "'s", "'t", "'nt", "I'm", "'ve", "'re", ]
+    escapes = [ ( contraction,
+                  "<<|%s|>>" % contraction.replace( "'", "" ) )
+                for contraction in contractions ]
+
+    # Step 1: hide the contractions behind their placeholders.
+    for contraction, placeholder in escapes:
+        result = result.replace( contraction, placeholder )
+
+    # Step 2: every single quote still present is treated as a
+    # quotation mark; convert them left to right, alternating
+    # between an opening and a closing smart quote.
+    while "'" in result:
+        result = result.replace( "'", OPEN_SINGLE_QUOTE, 1 )
+        result = result.replace( "'", CLOSE_SINGLE_QUOTE, 1 )
+
+    # Step 3: swap the placeholders back for the contractions
+    # they replaced.
+    for contraction, placeholder in escapes:
+        result = result.replace( placeholder, contraction )
+
+    return result
+
+
+def _smartDoubleQuote( inText ):
+    """
+    Converts straight double quotes into alternating opening and
+    closing "smart quotes" and returns the result as unicode.
+    """
+
+    # Work on a unicode copy; quotes are paired left to right.
+    result = unicode( inText[:] )
+    while "\"" in result:
+        result = result.replace( "\"", OPEN_QUOTE, 1 )
+        result = result.replace( "\"", CLOSE_QUOTE, 1 )
+    return result
+    
+
+def stringRatio( a, b ):
+    """
+    Returns a similarity score between 0.0 and 1.0 for a and b.
+
+    Equal strings score 1.0.  If one string is contained in the
+    other, the score is the shorter length divided by the longer
+    length.  Otherwise the score depends only on the two lengths:
+    twice the shorter length divided by the sum of both lengths.
+    Unrelated strings of similar length can therefore score high.
+    """
+
+    # Identical strings are a perfect match.
+    if a == b:
+        return 1.0
+
+    # Containment: the fraction of the longer string that the
+    # shorter one accounts for.
+    if a in b:
+        return float( len(a) ) / len(b)
+    if b in a:
+        return float( len(b) ) / len(a)
+
+    # Length-only estimate.  This is the computation performed by
+    # difflib.SequenceMatcher.real_quick_ratio(), written inline
+    # so that no SequenceMatcher object has to be built.
+    la = len( a )
+    lb = len( b )
+    total = la + lb
+    if not total:
+        return 1.0
+    return 2.0 * min( la, lb ) / total
+
+
+def stringRatioBestMatch( item, sequence ):
+    """
+    Returns the element of sequence that best matches item, i.e.,
+    the one with the highest stringRatio() score (the earliest such
+    element on a tie).  Raises ValueError if sequence is empty.
+    """
+
+    # The best match is the highest ratio, not the lowest.
+    ratios = [ stringRatio( item, element ) for element in sequence ]
+    return sequence[ ratios.index( max(ratios) ) ]
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test_strings.py	Fri Feb 22 14:27:22 2008 -0600
@@ -0,0 +1,82 @@
+"""
+    Tests for the enso.utils.strings module.
+"""
+
+# ----------------------------------------------------------------------------
+# Imports
+# ----------------------------------------------------------------------------
+
+import random        
+import unittest
+
+from enso.utils.strings import smartQuote
+from enso.utils.strings import stringRatio
+from enso.utils.strings import stringRatioBestMatch
+from enso.utils.strings import CLOSE_QUOTE
+from enso.utils.strings import OPEN_QUOTE
+from enso.utils.strings import CLOSE_SINGLE_QUOTE
+from enso.utils.strings import OPEN_SINGLE_QUOTE
+
+
+# ----------------------------------------------------------------------------
+# Unit Tests
+# ----------------------------------------------------------------------------
+
+class SmartQuoteTest( unittest.TestCase ):
+    """Tests for enso.utils.strings.smartQuote()."""
+    # Contractions only; smartQuote() is expected to return these as-is.
+    NULL_CASES = [
+        "I can't.",
+        "You're can't.",
+        "I'm sick of you're inability.",
+        "She's sick of he's being sick.",
+        "I've been Elmer's gran-dad.",
+        "He's sick of this.",
+        ]
+
+    def testNullCases( self ):
+        """Each contraction-only string must come back unchanged."""
+        for testString in self.NULL_CASES:
+            self.assertEqual( testString, smartQuote( testString ) )
+
+    # ( source, expected ) pairs of quoted text and its smart-quoted form.
+    CASES = [
+        ( "\"ABC\"", OPEN_QUOTE + "ABC" + CLOSE_QUOTE ),
+        ( "A\"BC\"", "A" + OPEN_QUOTE + "BC" + CLOSE_QUOTE ),
+        ( "A\"B\"C", "A" + OPEN_QUOTE + "B" + CLOSE_QUOTE + "C" ),
+        ( "'ABC'", OPEN_SINGLE_QUOTE + "ABC" + CLOSE_SINGLE_QUOTE ),
+        ( "A'BC'", "A" + OPEN_SINGLE_QUOTE + "BC" + CLOSE_SINGLE_QUOTE ),
+        ( "A'B'C", "A" + OPEN_SINGLE_QUOTE + "B" + CLOSE_SINGLE_QUOTE + "C" ),
+        ]
+
+    def testCases( self ):
+        """Each source string must convert to its expected form."""
+        for source, target in self.CASES:
+            self.assertEqual( target, smartQuote( source ) )
+            
+
+class StringComparisonTests( unittest.TestCase ):
+    """Tests for enso.utils.strings.stringRatio()."""
+    def setUp( self ):
+        # Fixed seed, so every run generates the same test strings.
+        random.seed( 0 )
+
+    def testRandomIdentities( self ):
+        """A string compared with itself must yield a ratio of 1."""
+        MAX_LENGTH = 30
+        CHARS = "abcdefghijklmnopqrstuvwxyz"
+        CHARS += CHARS.upper()
+        CHARS += "`1234567890-=~!@#$%^&*()_+[]\\{}|;':\",./<>?"
+        for i in range( MAX_LENGTH ):
+            for j in range( MAX_LENGTH ):
+                chars = random.sample( CHARS, i ) + random.sample( CHARS, j )
+                text = "".join( chars )
+                self.assertEqual( stringRatio( text[:], text[:] ), 1 )
+
+
+# ----------------------------------------------------------------------------
+# Script: run every test in this module when executed directly
+# ----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    unittest.main()