# Source code for biryani.strings

# -*- coding: utf-8 -*-


# Biryani -- A conversion and validation toolbox
# By: Emmanuel Raviart <emmanuel@raviart.com>
#
# Copyright (C) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Emmanuel Raviart
# http://packages.python.org/Biryani/
#
# This file is part of Biryani.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""Strings simplification functions"""


import unicodedata


# Names exported by ``from <this module> import *``. The helpers ``slugify_char`` and
# ``unicode_char_to_ascii`` are defined in this module but are not listed here.
__all__ = [
    'deep_decode',
    'deep_encode',
    'lower',
    'normalize',
    'slugify',
    'upper',
    ]

# Replacement table used by ``unicode_char_to_ascii``: each key is a single non-ASCII character, each value is the
# ASCII string substituted for it (possibly several characters, e.g. ligatures).
# Replacements keep the case of the character they stand for: a capital letter maps to a capitalised replacement.
ASCII_TRANSLATIONS = {
    u'\N{NO-BREAK SPACE}': ' ',
    u'\N{LATIN CAPITAL LETTER A WITH ACUTE}': 'A',
    u'\N{LATIN CAPITAL LETTER A WITH CIRCUMFLEX}': 'A',
    u'\N{LATIN CAPITAL LETTER A WITH DIAERESIS}': 'A',
    u'\N{LATIN CAPITAL LETTER A WITH GRAVE}': 'A',
    u'\N{LATIN CAPITAL LETTER A WITH RING ABOVE}': 'A',
    u'\N{LATIN CAPITAL LETTER A WITH TILDE}': 'A',
    u'\N{LATIN CAPITAL LETTER AE}': 'Ae',
    u'\N{LATIN CAPITAL LETTER C WITH CEDILLA}': 'C',
    u'\N{LATIN CAPITAL LETTER E WITH ACUTE}': 'E',
    u'\N{LATIN CAPITAL LETTER E WITH CIRCUMFLEX}': 'E',
    u'\N{LATIN CAPITAL LETTER E WITH DIAERESIS}': 'E',
    u'\N{LATIN CAPITAL LETTER E WITH GRAVE}': 'E',
    u'\N{LATIN CAPITAL LETTER ETH}': 'Th',
    u'\N{LATIN CAPITAL LETTER I WITH ACUTE}': 'I',
    u'\N{LATIN CAPITAL LETTER I WITH CIRCUMFLEX}': 'I',
    u'\N{LATIN CAPITAL LETTER I WITH DIAERESIS}': 'I',
    u'\N{LATIN CAPITAL LETTER I WITH GRAVE}': 'I',
    u'\N{LATIN CAPITAL LETTER N WITH TILDE}': 'N',
    u'\N{LATIN CAPITAL LETTER O WITH ACUTE}': 'O',
    u'\N{LATIN CAPITAL LETTER O WITH CIRCUMFLEX}': 'O',
    u'\N{LATIN CAPITAL LETTER O WITH DIAERESIS}': 'O',
    u'\N{LATIN CAPITAL LETTER O WITH GRAVE}': 'O',
    u'\N{LATIN CAPITAL LETTER O WITH STROKE}': 'O',
    u'\N{LATIN CAPITAL LETTER O WITH TILDE}': 'O',
    u'\N{LATIN CAPITAL LIGATURE OE}': 'Oe',
    # Capitalised like the other capital letters (the small thorn below maps to 'th').
    u'\N{LATIN CAPITAL LETTER THORN}': 'Th',
    u'\N{LATIN CAPITAL LETTER U WITH ACUTE}': 'U',
    u'\N{LATIN CAPITAL LETTER U WITH CIRCUMFLEX}': 'U',
    u'\N{LATIN CAPITAL LETTER U WITH DIAERESIS}': 'U',
    u'\N{LATIN CAPITAL LETTER U WITH GRAVE}': 'U',
    u'\N{LATIN CAPITAL LETTER Y WITH ACUTE}': 'Y',
    u'\N{LATIN SMALL LETTER A WITH ACUTE}': 'a',
    u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}': 'a',
    u'\N{LATIN SMALL LETTER A WITH DIAERESIS}': 'a',
    u'\N{LATIN SMALL LETTER A WITH GRAVE}': 'a',
    u'\N{LATIN SMALL LETTER A WITH RING ABOVE}': 'a',
    u'\N{LATIN SMALL LETTER A WITH TILDE}': 'a',
    u'\N{LATIN SMALL LETTER AE}': 'ae',
    u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'c',
    u'\N{LATIN SMALL LETTER E WITH ACUTE}': 'e',
    u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}': 'e',
    u'\N{LATIN SMALL LETTER E WITH DIAERESIS}': 'e',
    u'\N{LATIN SMALL LETTER E WITH GRAVE}': 'e',
    u'\N{LATIN SMALL LETTER ETH}': 'th',
    u'\N{LATIN SMALL LETTER I WITH ACUTE}': 'i',
    u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}': 'i',
    u'\N{LATIN SMALL LETTER I WITH DIAERESIS}': 'i',
    u'\N{LATIN SMALL LETTER I WITH GRAVE}': 'i',
    u'\N{LATIN SMALL LETTER N WITH TILDE}': 'n',
    u'\N{LATIN SMALL LETTER O WITH ACUTE}': 'o',
    u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}': 'o',
    u'\N{LATIN SMALL LETTER O WITH DIAERESIS}': 'o',
    u'\N{LATIN SMALL LETTER O WITH GRAVE}': 'o',
    u'\N{LATIN SMALL LETTER O WITH STROKE}': 'o',
    u'\N{LATIN SMALL LETTER O WITH TILDE}': 'o',
    u'\N{LATIN SMALL LIGATURE OE}': 'oe',
    u'\N{LATIN SMALL LETTER SHARP S}': 'ss',
    u'\N{LATIN SMALL LETTER THORN}': 'th',
    u'\N{LATIN SMALL LETTER U WITH ACUTE}': 'u',
    u'\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}': 'u',
    u'\N{LATIN SMALL LETTER U WITH DIAERESIS}': 'u',
    u'\N{LATIN SMALL LETTER U WITH GRAVE}': 'u',
    u'\N{LATIN SMALL LETTER Y WITH ACUTE}': 'y',
    u'\N{LATIN SMALL LETTER Y WITH DIAERESIS}': 'y',
    u'\N{LEFT SINGLE QUOTATION MARK}': "'",
    u'\N{RIGHT SINGLE QUOTATION MARK}': "'",
    }


def deep_decode(value, encoding = 'utf-8'):
    """Convert recursively bytes strings embedded in Python data to unicode strings.

    Dictionaries (both keys and values), lists and tuples are walked recursively and rebuilt as plain ``dict``,
    ``list`` and ``tuple`` objects. Bytes strings are decoded using ``encoding``. Every other value (including
    unicode strings and ``None``) is returned unchanged.

    :param value: the data to convert
    :param encoding: name of the encoding used to decode bytes strings
    :returns: the converted data
    :raises UnicodeDecodeError: when a bytes string can not be decoded using ``encoding``

    The examples below show Python 2 output. On Python 3 only ``bytes`` values are decoded, since ``str`` is
    already unicode.

    >>> deep_decode('Hello world!')
    u'Hello world!'
    >>> deep_decode(dict(a = 'b', c = ['d', 'e']))
    {u'a': u'b', u'c': [u'd', u'e']}
    >>> deep_decode(u'Hello world!')
    u'Hello world!'
    >>> deep_decode(42)
    42
    >>> print(deep_decode(None))
    None
    """
    # ``bytes`` is an alias of ``str`` on Python 2 (>= 2.6) and the real bytes type on Python 3.
    if isinstance(value, bytes):
        return value.decode(encoding)
    if isinstance(value, dict):
        return dict(
            (deep_decode(name, encoding = encoding), deep_decode(item, encoding = encoding))
            for name, item in value.items()
            )
    if isinstance(value, list):
        return [
            deep_decode(item, encoding = encoding)
            for item in value
            ]
    if isinstance(value, tuple):
        return tuple(
            deep_decode(item, encoding = encoding)
            for item in value
            )
    # Unicode strings, numbers, None, etc. need no conversion.
    return value
def deep_encode(value, encoding = 'utf-8'):
    """Convert recursively unicode strings embedded in Python data to encoded strings.

    Dictionaries (both keys and values), lists and tuples are walked recursively and rebuilt as plain ``dict``,
    ``list`` and ``tuple`` objects. Unicode strings are encoded using ``encoding``. Every other value (including
    bytes strings and ``None``) is returned unchanged.

    :param value: the data to convert
    :param encoding: name of the encoding used to encode unicode strings
    :returns: the converted data
    :raises UnicodeEncodeError: when a unicode string can not be encoded using ``encoding``

    The examples below show Python 2 output. On Python 3 every ``str`` is a unicode string and is therefore
    encoded to ``bytes``.

    >>> deep_encode(u'Hello world!')
    'Hello world!'
    >>> deep_encode({u'a': u'b', u'c': [u'd', u'e']})
    {'a': 'b', 'c': ['d', 'e']}
    >>> deep_encode('Hello world!')
    'Hello world!'
    >>> deep_encode(42)
    42
    >>> print(deep_encode(None))
    None
    """
    # ``bytes`` is an alias of ``str`` on Python 2 (>= 2.6): already-encoded strings are left alone.
    if isinstance(value, bytes):
        return value
    # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3.
    if isinstance(value, type(u'')):
        return value.encode(encoding)
    if isinstance(value, dict):
        return dict(
            (deep_encode(name, encoding = encoding), deep_encode(item, encoding = encoding))
            for name, item in value.items()
            )
    if isinstance(value, list):
        return [
            deep_encode(item, encoding = encoding)
            for item in value
            ]
    if isinstance(value, tuple):
        return tuple(
            deep_encode(item, encoding = encoding)
            for item in value
            )
    # Numbers, None, etc. need no conversion.
    return value
def lower(s):
    """Return ``s`` converted to lower case, or ``None`` when ``s`` is ``None``.

    .. note:: This function does the same as the ``lower()`` method of strings. It exists so that it can be given
       where a function is expected, for example as the ``transform`` argument of the :func:`normalize` &
       :func:`slugify` functions.

    >>> lower('Hello world!')
    'hello world!'
    >>> lower(u'Hello world!')
    u'hello world!'
    >>> print(lower(None))
    None
    """
    return None if s is None else s.lower()
def normalize(s, encoding = 'utf-8', separator = u' ', transform = lower):
    """Convert a string to its normal form using compatibility decomposition and removing combining characters.

    The string is decomposed (Unicode NFKD form), its combining characters (accents, cedillas, etc.) are dropped,
    leading and trailing whitespace is removed and every run of whitespace is replaced by ``separator``. Finally
    ``transform`` is applied to the result.

    :param s: the bytes or unicode string to normalize, or ``None``
    :param encoding: name of the encoding used to decode ``s`` when it is a bytes string
    :param separator: unicode string put between the words of the result
    :param transform: function applied to the normalized string (lower-casing by default), or ``None`` to skip
        this step
    :returns: the normalized unicode string, or ``None`` when ``s`` is ``None``
    :raises AssertionError: when ``s`` is neither a string nor ``None``

    The examples below show Python 2 output.

    >>> normalize(u'Hello world!')
    u'hello world!'
    >>> normalize(u' Hello world! ')
    u'hello world!'
    >>> normalize('œil, forêt, ça, où...')
    u'\u0153il, foret, ca, ou...'
    >>> normalize('Hello world!')
    u'hello world!'
    >>> normalize(u' ')
    u''
    >>> print(normalize(None))
    None
    """
    if s is None:
        return None
    # ``bytes`` is an alias of ``str`` on Python 2 (>= 2.6) and the real bytes type on Python 3.
    if isinstance(s, bytes):
        s = s.decode(encoding)
    # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3.
    assert isinstance(s, type(u'')), str((s,))
    decomposed = unicodedata.normalize('NFKD', s)
    # A combining class of 0 means the character is not a combining mark, so it is kept.
    normalized = u''.join(c for c in decomposed if unicodedata.combining(c) == 0)
    # split() without argument ignores leading & trailing whitespace and merges runs of whitespace.
    normalized = separator.join(normalized.split())
    if transform is not None:
        normalized = transform(normalized)
    return normalized
def slugify(s, encoding = 'utf-8', separator = u'-', transform = lower):
    """Simplify a string, converting it to a lowercase ASCII subset.

    Each character is converted by :func:`slugify_char`, leading and trailing spaces are removed, every run of
    spaces is replaced by a single ``separator`` and finally ``transform`` is applied to the result.

    :param s: the bytes or unicode string to simplify, or ``None``
    :param encoding: name of the encoding used to decode ``s`` when it is a bytes string
    :param separator: unicode string put between the words of the result
    :param transform: function applied to the simplified string (lower-casing by default), or ``None`` to skip
        this step
    :returns: the simplified unicode string, or ``None`` when ``s`` is ``None``
    :raises AssertionError: when ``s`` is neither a string nor ``None``

    The examples below show Python 2 output.

    >>> slugify(u'Hello world!')
    u'hello-world'
    >>> slugify(u' Hello world! ')
    u'hello-world'
    >>> slugify('œil, forêt, ça, où...')
    u'oeil-foret-ca-ou'
    >>> slugify('Hello world!')
    u'hello-world'
    >>> print(slugify(None))
    None
    """
    if s is None:
        return None
    # ``bytes`` is an alias of ``str`` on Python 2 (>= 2.6) and the real bytes type on Python 3.
    if isinstance(s, bytes):
        s = s.decode(encoding)
    # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3.
    assert isinstance(s, type(u'')), str((s,))
    simplified = u''.join(slugify_char(unicode_char) for unicode_char in s)
    # Trim the string and squeeze every run of spaces into a single one. A split/join is used instead of a
    # "replace while a space remains" loop, which can not terminate as soon as the string contains a space.
    simplified = u' '.join(simplified.split())
    if separator != u' ':
        simplified = simplified.replace(u' ', separator)
    if transform is not None:
        simplified = transform(simplified)
    return simplified
def slugify_char(unicode_char):
    """Return the uppercase ASCII equivalent of a unicode character.

    The character is first converted by :func:`unicode_char_to_ascii`, then upper-cased. Every resulting
    character that is neither a space, a digit nor a letter from A to Z is replaced by a space.

    The result may be an empty string, or may contain several characters (for example, 'œ' becomes 'OE').
    """
    ascii_chars = unicode_char_to_ascii(unicode_char)
    if not ascii_chars:
        # Nothing to filter: the character has no ASCII equivalent.
        return ascii_chars
    allowed_chars = ' 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    return ''.join(
        char if char in allowed_chars else ' '
        for char in ascii_chars.upper()
        )


def unicode_char_to_ascii(unicode_char):
    """Return the ASCII string equivalent to a unicode character.

    Characters listed in ``ASCII_TRANSLATIONS`` are replaced by their translation (which may contain several
    characters), ASCII characters are kept as is, and any other character is converted to an empty string.
    """
    translation = ASCII_TRANSLATIONS.get(unicode_char)
    if translation is not None:
        return translation
    # Code points below 0x80 are plain ASCII.
    return str(unicode_char) if ord(unicode_char) < 0x80 else ''
def upper(s):
    """Return ``s`` converted to upper case, or ``None`` when ``s`` is ``None``.

    .. note:: This function does the same as the ``upper()`` method of strings. It exists so that it can be given
       where a function is expected, for example as the ``transform`` argument of the :func:`normalize` &
       :func:`slugify` functions.

    >>> upper('Hello world!')
    'HELLO WORLD!'
    >>> upper(u'Hello world!')
    u'HELLO WORLD!'
    >>> print(upper(None))
    None
    """
    return None if s is None else s.upper()