view enso/utils/strings.py @ 11:bfca1c0eccea

Added enso.utils.strings and unit tests for it.
author Atul Varma <varmaa@toolness.com>
date Fri, 22 Feb 2008 14:27:22 -0600
parents
children
line wrap: on
line source

# Copyright (c) 2008, Humanized, Inc.
# All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
#    3. Neither the name of Enso nor the names of its contributors may
#       be used to endorse or promote products derived from this
#       software without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY Humanized, Inc. ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL Humanized, Inc. BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# ----------------------------------------------------------------------------
#
#   enso.utils.strings
#
# ----------------------------------------------------------------------------

"""
    Various string utility methods.
"""

# ----------------------------------------------------------------------------
# Constants
# ----------------------------------------------------------------------------

# Double "smart quotes".
OPEN_QUOTE = u"\N{LEFT DOUBLE QUOTATION MARK}"
CLOSE_QUOTE = u"\N{RIGHT DOUBLE QUOTATION MARK}"

# Single "smart quotes" (the curly apostrophe-style pair).
OPEN_SINGLE_QUOTE = u"\N{LEFT SINGLE QUOTATION MARK}"
CLOSE_SINGLE_QUOTE = u"\N{RIGHT SINGLE QUOTATION MARK}"


# ----------------------------------------------------------------------------
# String utility functions
# ----------------------------------------------------------------------------

def smartQuote( text ):
    """
    Returns a unicode copy of text in which straight double and single
    quotes have been swapped for left- and right-facing "smart quotes".

    Double quotes are converted first, then single quotes.

    NOTE: The underlying algorithm is deliberately simple.  When quoting
    an arbitrary chunk of text, it is best to apply this function to
    the formatting string, e.g., use this on:
        ' %s ' - output from blah command
    and only afterwards perform the formatting operation that
    substitutes in the unknown text.
    """

    return _smartSingleQuote( _smartDoubleQuote( text ) )


def _smartSingleQuote( inText ):
    """
    Returns a unicode copy of inText in which single quotes have been
    replaced by opening/closing "smart" single quotes, leaving alone
    the apostrophes that belong to a fixed list of contractions.
    """

    # Work on a unicode copy of the input.
    result = unicode( inText[:] )

    # A single quote mark is either part of a quotation or part of a
    # contraction.  Each known contraction is paired with an
    # apostrophe-free placeholder so that it can be hidden while the
    # quotation marks are processed, and put back afterwards.
    contractions = [ "'s", "'t", "'nt", "I'm", "'ve", "'re", ]
    placeholders = [
        ( contraction, "<<|%s|>>" % contraction.replace( "'", "" ) )
        for contraction in contractions
        ]

    # Hide the contractions.
    for contraction, placeholder in placeholders:
        result = result.replace( contraction, placeholder )

    # Every remaining single quote is treated as a quotation mark:
    # they are consumed in pairs, the first of each pair becoming an
    # opening quote and the second a closing quote.
    while "'" in result:
        result = result.replace( "'", OPEN_SINGLE_QUOTE, 1 )
        result = result.replace( "'", CLOSE_SINGLE_QUOTE, 1 )

    # Restore the contractions in place of their placeholders.
    for contraction, placeholder in placeholders:
        result = result.replace( placeholder, contraction )

    return result


def _smartDoubleQuote( inText ):
    """
    Returns a unicode copy of inText in which double quotes have been
    replaced by opening/closing "smart" double quotes.
    """

    # Work on a unicode copy of the input.
    result = unicode( inText[:] )

    # Consume the straight quotes in pairs: the first of each pair
    # becomes an opening quote, the second a closing quote.
    while '"' in result:
        result = result.replace( '"', OPEN_QUOTE, 1 )
        result = result.replace( '"', CLOSE_QUOTE, 1 )

    return result
    

def stringRatio( a, b ):
    """
    Calculates a similarity ratio between a and b.

    Returns 1.0 if the two are equal.  If one is contained in the
    other, returns the length of the contained one divided by the
    length of the containing one.  Otherwise, returns an estimate
    computed purely from the two lengths.

    NOTE: Because that last estimate looks only at lengths, two
    unrelated strings of similar length still score close to 1.0.
    """

    if a == b:
        return 1.0

    if a in b:
        return float( len(a) ) / len(b)

    if b in a:
        return float( len(b) ) / len(a)

    # What follows is meant to give the same result as:
    #
    #  import difflib
    #  seqMatch = difflib.SequenceMatcher( False, a, b )
    #  ratio = seqMatch.real_quick_ratio()
    #  return ratio
    #
    # The calculation has been copied inline from difflib for
    # efficiency purposes.

    lengthA = len(a)
    lengthB = len(b)
    totalLength = lengthA + lengthB

    if not totalLength:
        return 1.0

    return 2.0 * min( lengthA, lengthB ) / totalLength


def stringRatioBestMatch( item, sequence ):
    """
    Uses a string ratio algorithm to find the best match to item
    among the elements of sequence.

    item is compared against every element with stringRatio(), and
    the element with the highest ratio is returned.  If several
    elements share the highest ratio, the first of them is returned.

    sequence must support indexing and must not be empty; an empty
    sequence raises ValueError.
    """

    ratios = [ stringRatio( item, element )
               for element in sequence ]

    # stringRatio() returns 1.0 for identical strings and smaller
    # values otherwise, so the best match is the element with the
    # *largest* ratio, not the smallest.
    return sequence[ ratios.index( max(ratios) ) ]