Mercurial > enso_core
changeset 11:bfca1c0eccea
Added enso.utils.strings and unit tests for it.
author | Atul Varma <varmaa@toolness.com> |
---|---|
date | Fri, 22 Feb 2008 14:27:22 -0600 |
parents | 01bd04cb9ba8 |
children | 40f72f6cd6eb |
files | README enso/utils/__init__.py enso/utils/strings.py tests/test_strings.py |
diffstat | 3 files changed, 257 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/README Fri Feb 22 07:30:36 2008 -0600 +++ b/README Fri Feb 22 14:27:22 2008 -0600 @@ -0,0 +1,6 @@ +Enso Readme +=========== + +For the time being, in order to use Enso, you need to add the root +directory of the Enso source tree to your PYTHONPATH environment +variable.
# Diff header of the original changeset entry:
#   --- /dev/null Thu Jan 01 00:00:00 1970 +0000
#   +++ b/enso/utils/strings.py Fri Feb 22 14:27:22 2008 -0600
#   @@ -0,0 +1,169 @@

# Copyright (c) 2008, Humanized, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
#    3. Neither the name of Enso nor the names of its contributors may
#       be used to endorse or promote products derived from this
#       software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY Humanized, Inc. ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL Humanized, Inc. BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# ----------------------------------------------------------------------------
#
#   enso.utils.strings
#
# ----------------------------------------------------------------------------

"""
    Various string utility methods.
"""

# ----------------------------------------------------------------------------
# Constants
# ----------------------------------------------------------------------------

# Double "smart quotes".
OPEN_QUOTE = u"\u201C"
CLOSE_QUOTE = u"\u201D"

# Single "smart quotes".
OPEN_SINGLE_QUOTE = u"\u2018"
CLOSE_SINGLE_QUOTE = u"\u2019"

# The text type used for results: `unicode` where that builtin exists
# (Python 2), otherwise `str` (Python 3, where `unicode` is not defined).
try:
    _textType = unicode
except NameError:
    _textType = str

# Substrings whose apostrophe must survive smart-quoting.  They are
# escaped in this order before quotes are paired up, then restored.
_CONTRACTION_CASES = [ "'s", "'t", "'nt", "I'm", "'ve", "'re", ]


# ----------------------------------------------------------------------------
# String utility functions
# ----------------------------------------------------------------------------

def smartQuote( text ):
    """
    Replaces regular quotes in text with "smart quotes", i.e., left and right
    facing quotes, and returns the result as a unicode object.

    Double quotes are converted first, then single quotes (apostrophes
    belonging to a fixed list of contractions are left untouched).

    NOTE: This uses a very simple algorithm; if you are trying to quote
    an arbitrary chunk of text, it would be best to use this function
    on your formatting string, e.g., use this on:
        ' %s ' - output from blah command
    before you apply the formatting operation that dumps unknown text.
    """

    text = _smartDoubleQuote( text )
    text = _smartSingleQuote( text )

    return text


def _pairQuotes( text, plainQuote, openQuote, closeQuote ):
    """
    Returns text with every occurrence of plainQuote replaced,
    alternating between openQuote (1st, 3rd, ... occurrence) and
    closeQuote (2nd, 4th, ... occurrence).

    An unpaired final quote therefore becomes an opening quote.
    """

    pieces = text.split( plainQuote )
    replacements = ( openQuote, closeQuote )

    # Re-join the pieces in a single pass, inserting the opening or
    # closing quote at each position where plainQuote used to be.
    result = [ pieces[0] ]
    for index, piece in enumerate( pieces[1:] ):
        result.append( replacements[index % 2] )
        result.append( piece )

    return u"".join( result )


def _smartSingleQuote( inText ):
    """
    Replaces single quotes with "smart quotes", i.e., forward
    and back facing quotes, except for single quotes that are
    parts of certain contractions (see _CONTRACTION_CASES).

    NOTE: Contractions are protected with temporary "<<|...|>>"
    escape sequences; input that already contains such a sequence
    for a listed contraction will have it turned into the contraction.
    """

    outText = _textType( inText )

    # There are two usages of single quote marks; for
    # quotations, and for contractions.

    # First, we escape the contraction cases. Then,
    # without those pesky apostrophes, we will be free
    # and clear to replace the remaining single quotes
    # with smart quotes.
    escapes = [ ( case, "<<|%s|>>" % case.replace( "'", "" ) )
                for case in _CONTRACTION_CASES ]
    for case, placeholder in escapes:
        outText = outText.replace( case, placeholder )

    # Now that there are no apostrophes, replace each pair of
    # single quotes with opening and closing 'smart single quotes'.
    outText = _pairQuotes( outText, "'",
                           OPEN_SINGLE_QUOTE, CLOSE_SINGLE_QUOTE )

    # Finally, replace the contraction escape sequences
    # with the original contractions.
    for case, placeholder in escapes:
        outText = outText.replace( placeholder, case )

    return outText


def _smartDoubleQuote( inText ):
    """
    Replaces double quotes with "smart quotes", i.e., forward
    and back facing quotes.
    """

    return _pairQuotes( _textType( inText ), "\"",
                        OPEN_QUOTE, CLOSE_QUOTE )


def stringRatio( a, b ):
    """
    Calculates the string ratio of a to b, a float between 0.0 and 1.0.

    If the strings are equal, returns 1.0.  If one string is contained
    in the other, returns len(shorter) / len(longer) (so an empty string
    against a non-empty one gives 0.0).

    Otherwise the result is a quick estimate computed from the two
    lengths only: 2 * min(len(a), len(b)) / (len(a) + len(b)).  The
    contents are not compared in that case, so two unrelated strings
    of equal length score 1.0.
    """

    if a == b:
        return 1.0
    elif a in b:
        return float( len(a) ) / len(b)
    elif b in a:
        return float( len(b) ) / len(a)
    else:
        # The following code is actually identical to this code:
        #
        #  import difflib
        #  seqMatch = difflib.SequenceMatcher( None, a, b )
        #  ratio = seqMatch.real_quick_ratio()
        #  return ratio
        #
        # But has been copied from difflib and pasted inline here for
        # efficiency purposes.

        la, lb = len(a), len(b)

        length = la + lb
        if length:
            return 2.0 * (min(la, lb)) / length
        return 1.0


def stringRatioBestMatch( item, sequence ):
    """
    Uses a string ratio algorithm to find the best match
    to item among the elements of sequence.

    Returns the element of sequence with the highest stringRatio()
    against item; on ties, the earliest such element is returned.
    Raises ValueError if sequence is empty.
    """

    ratios = [ stringRatio( item, element )
               for element in sequence ]

    # The best match is the one with the HIGHEST ratio (1.0 == equal).
    return sequence[ ratios.index( max(ratios) ) ]
# Diff header of the original changeset entry:
#   --- /dev/null Thu Jan 01 00:00:00 1970 +0000
#   +++ b/tests/test_strings.py Fri Feb 22 14:27:22 2008 -0600
#   @@ -0,0 +1,82 @@

"""
    Tests for the enso.utils.strings module.
"""

# ----------------------------------------------------------------------------
# Imports
# ----------------------------------------------------------------------------

import random
import unittest

from enso.utils.strings import smartQuote
from enso.utils.strings import stringRatio
from enso.utils.strings import stringRatioBestMatch
from enso.utils.strings import CLOSE_QUOTE
from enso.utils.strings import OPEN_QUOTE
from enso.utils.strings import CLOSE_SINGLE_QUOTE
from enso.utils.strings import OPEN_SINGLE_QUOTE


# ----------------------------------------------------------------------------
# Unit Tests
# ----------------------------------------------------------------------------

class SmartQuoteTest( unittest.TestCase ):
    """
    Tests for smartQuote().
    """

    # Strings containing only contraction apostrophes; smartQuote()
    # is expected to return them unchanged.
    NULL_CASES = [
        "I can't.",
        "You're can't.",
        "I'm sick of you're inability.",
        "She's sick of he's being sick.",
        "I've been Elmer's gran-dad.",
        "He's sick of this.",
        ]

    def testNullCases( self ):
        """
        smartQuote() leaves contraction-only strings untouched.
        """

        # assertEqual is used instead of the failUnlessEqual alias,
        # which newer unittest versions no longer provide.
        for testString in self.NULL_CASES:
            self.assertEqual( testString,
                              smartQuote( testString ) )

    # ( source, expected ) pairs of quoted text.
    CASES = [
        ( "\"ABC\"", OPEN_QUOTE + "ABC" + CLOSE_QUOTE ),
        ( "A\"BC\"", "A" + OPEN_QUOTE + "BC" + CLOSE_QUOTE ),
        ( "A\"B\"C", "A" + OPEN_QUOTE + "B" + CLOSE_QUOTE + "C" ),
        ( "'ABC'",
          OPEN_SINGLE_QUOTE + "ABC" + CLOSE_SINGLE_QUOTE ),
        ( "A'BC'",
          "A" + OPEN_SINGLE_QUOTE + "BC" + CLOSE_SINGLE_QUOTE ),
        ( "A'B'C",
          "A" + OPEN_SINGLE_QUOTE + "B" + CLOSE_SINGLE_QUOTE + "C" ),
        ]

    def testCases( self ):
        """
        smartQuote() converts paired plain quotes into smart quotes.
        """

        for source, target in self.CASES:
            self.assertEqual( target,
                              smartQuote( source ) )


class StringComparisonTests( unittest.TestCase ):
    """
    Tests for stringRatio().
    """

    def setUp( self ):
        # Fixed seed so the random strings are the same on every run.
        random.seed( 0 )


    def testRandomIdentities( self ):
        """
        stringRatio() of a string against an equal copy of itself is 1.
        """

        MAX_LENGTH = 30
        CHARS = "abcdefghijklmnopqrstuvwxyz"
        CHARS += CHARS.upper()
        CHARS += "`1234567890-=~!@#$%^&*()_+[]\\{}|;':\",./<>?"
        for i in range( MAX_LENGTH ):
            for j in range( MAX_LENGTH ):
                string = random.sample( CHARS, i )
                string += random.sample( CHARS, j )
                string = "".join( string )
                self.assertEqual( stringRatio( string[:], string[:] ),
                                  1 )


# ----------------------------------------------------------------------------
# Script
# ----------------------------------------------------------------------------

if __name__ == "__main__":
    unittest.main()