# -*- coding: utf-8 -*-
# Biryani -- A conversion and validation toolbox
# By: Emmanuel Raviart <emmanuel@raviart.com>
#
# Copyright (C) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Emmanuel Raviart
# http://packages.python.org/Biryani/
#
# This file is part of Biryani.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Strings simplification functions"""
import unicodedata
# Public API of this module: the names exported by ``from <module> import *``.
# The helpers ``slugify_char`` and ``unicode_char_to_ascii`` are deliberately not listed.
__all__ = [
'deep_decode',
'deep_encode',
'lower',
'normalize',
'slugify',
'upper',
]
# Maps a single non-ASCII unicode character to its ASCII replacement (one or several characters).
# The case of the replacement follows the case of the source character: capitals map to a
# capitalised replacement ('Ae', 'Oe', 'Th'), small letters to a lowercase one.
# Characters missing from this table are handled by ``unicode_char_to_ascii``.
ASCII_TRANSLATIONS = {
    u'\N{NO-BREAK SPACE}': ' ',
    u'\N{LATIN CAPITAL LETTER A WITH ACUTE}': 'A',
    u'\N{LATIN CAPITAL LETTER A WITH CIRCUMFLEX}': 'A',
    u'\N{LATIN CAPITAL LETTER A WITH DIAERESIS}': 'A',
    u'\N{LATIN CAPITAL LETTER A WITH GRAVE}': 'A',
    u'\N{LATIN CAPITAL LETTER A WITH RING ABOVE}': 'A',
    u'\N{LATIN CAPITAL LETTER A WITH TILDE}': 'A',
    u'\N{LATIN CAPITAL LETTER AE}': 'Ae',
    u'\N{LATIN CAPITAL LETTER C WITH CEDILLA}': 'C',
    u'\N{LATIN CAPITAL LETTER E WITH ACUTE}': 'E',
    u'\N{LATIN CAPITAL LETTER E WITH CIRCUMFLEX}': 'E',
    u'\N{LATIN CAPITAL LETTER E WITH DIAERESIS}': 'E',
    u'\N{LATIN CAPITAL LETTER E WITH GRAVE}': 'E',
    u'\N{LATIN CAPITAL LETTER ETH}': 'Th',
    u'\N{LATIN CAPITAL LETTER I WITH ACUTE}': 'I',
    u'\N{LATIN CAPITAL LETTER I WITH CIRCUMFLEX}': 'I',
    u'\N{LATIN CAPITAL LETTER I WITH DIAERESIS}': 'I',
    u'\N{LATIN CAPITAL LETTER I WITH GRAVE}': 'I',
    u'\N{LATIN CAPITAL LETTER N WITH TILDE}': 'N',
    u'\N{LATIN CAPITAL LETTER O WITH ACUTE}': 'O',
    u'\N{LATIN CAPITAL LETTER O WITH CIRCUMFLEX}': 'O',
    u'\N{LATIN CAPITAL LETTER O WITH DIAERESIS}': 'O',
    u'\N{LATIN CAPITAL LETTER O WITH GRAVE}': 'O',
    u'\N{LATIN CAPITAL LETTER O WITH STROKE}': 'O',
    u'\N{LATIN CAPITAL LETTER O WITH TILDE}': 'O',
    u'\N{LATIN CAPITAL LIGATURE OE}': 'Oe',
    # Capitalised like the other capital digraphs (was lowercase 'th').
    u'\N{LATIN CAPITAL LETTER THORN}': 'Th',
    u'\N{LATIN CAPITAL LETTER U WITH ACUTE}': 'U',
    u'\N{LATIN CAPITAL LETTER U WITH CIRCUMFLEX}': 'U',
    u'\N{LATIN CAPITAL LETTER U WITH DIAERESIS}': 'U',
    u'\N{LATIN CAPITAL LETTER U WITH GRAVE}': 'U',
    u'\N{LATIN CAPITAL LETTER Y WITH ACUTE}': 'Y',
    # Counterpart of the small letter entry below.
    u'\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}': 'Y',
    u'\N{LATIN SMALL LETTER A WITH ACUTE}': 'a',
    u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}': 'a',
    u'\N{LATIN SMALL LETTER A WITH DIAERESIS}': 'a',
    u'\N{LATIN SMALL LETTER A WITH GRAVE}': 'a',
    u'\N{LATIN SMALL LETTER A WITH RING ABOVE}': 'a',
    u'\N{LATIN SMALL LETTER A WITH TILDE}': 'a',
    u'\N{LATIN SMALL LETTER AE}': 'ae',
    u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'c',
    u'\N{LATIN SMALL LETTER E WITH ACUTE}': 'e',
    u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}': 'e',
    u'\N{LATIN SMALL LETTER E WITH DIAERESIS}': 'e',
    u'\N{LATIN SMALL LETTER E WITH GRAVE}': 'e',
    u'\N{LATIN SMALL LETTER ETH}': 'th',
    u'\N{LATIN SMALL LETTER I WITH ACUTE}': 'i',
    u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}': 'i',
    u'\N{LATIN SMALL LETTER I WITH DIAERESIS}': 'i',
    u'\N{LATIN SMALL LETTER I WITH GRAVE}': 'i',
    u'\N{LATIN SMALL LETTER N WITH TILDE}': 'n',
    u'\N{LATIN SMALL LETTER O WITH ACUTE}': 'o',
    u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}': 'o',
    u'\N{LATIN SMALL LETTER O WITH DIAERESIS}': 'o',
    u'\N{LATIN SMALL LETTER O WITH GRAVE}': 'o',
    u'\N{LATIN SMALL LETTER O WITH STROKE}': 'o',
    u'\N{LATIN SMALL LETTER O WITH TILDE}': 'o',
    u'\N{LATIN SMALL LIGATURE OE}': 'oe',
    u'\N{LATIN SMALL LETTER SHARP S}': 'ss',
    u'\N{LATIN SMALL LETTER THORN}': 'th',
    u'\N{LATIN SMALL LETTER U WITH ACUTE}': 'u',
    u'\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}': 'u',
    u'\N{LATIN SMALL LETTER U WITH DIAERESIS}': 'u',
    u'\N{LATIN SMALL LETTER U WITH GRAVE}': 'u',
    u'\N{LATIN SMALL LETTER Y WITH ACUTE}': 'y',
    u'\N{LATIN SMALL LETTER Y WITH DIAERESIS}': 'y',
    u'\N{LEFT SINGLE QUOTATION MARK}': "'",
    u'\N{RIGHT SINGLE QUOTATION MARK}': "'",
    }
def deep_decode(value, encoding = 'utf-8'):
    """Convert recursively bytes strings embedded in Python data to unicode strings.

    Byte strings are decoded using ``encoding``. Dictionaries (keys and values), lists and tuples are
    rebuilt with their content decoded. Unicode strings and every other kind of value are returned as is.

    >>> deep_decode(b'Hello world!') == u'Hello world!'
    True
    >>> deep_decode(dict(a = b'b', c = [b'd', b'e'])) == {u'a': u'b', u'c': [u'd', u'e']}
    True
    >>> deep_decode(u'Hello world!') == u'Hello world!'
    True
    >>> deep_decode(42)
    42
    >>> deep_decode(None) is None
    True
    """
    # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3, and ``bytes`` is the byte string
    # type on both (an alias of ``str`` on Python 2), so the checks below work on either version.
    text_type = type(u'')
    if isinstance(value, text_type):
        return value
    if isinstance(value, bytes):
        return value.decode(encoding)
    if isinstance(value, dict):
        return dict(
            (deep_decode(name, encoding = encoding), deep_decode(item, encoding = encoding))
            for name, item in value.items()
            )
    if isinstance(value, list):
        return [deep_decode(item, encoding = encoding) for item in value]
    if isinstance(value, tuple):
        return tuple(deep_decode(item, encoding = encoding) for item in value)
    return value
def deep_encode(value, encoding = 'utf-8'):
    """Convert recursively unicode strings embedded in Python data to encoded strings.

    Unicode strings are encoded using ``encoding``. Dictionaries (keys and values), lists and tuples are
    rebuilt with their content encoded. Byte strings and every other kind of value are returned as is.

    >>> deep_encode(u'Hello world!') == b'Hello world!'
    True
    >>> deep_encode({u'a': u'b', u'c': [u'd', u'e']}) == {b'a': b'b', b'c': [b'd', b'e']}
    True
    >>> deep_encode(b'Hello world!') == b'Hello world!'
    True
    >>> deep_encode(42)
    42
    >>> deep_encode(None) is None
    True
    """
    # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3, and ``bytes`` is the byte string
    # type on both (an alias of ``str`` on Python 2), so the checks below work on either version.
    text_type = type(u'')
    if isinstance(value, bytes):
        return value
    if isinstance(value, text_type):
        return value.encode(encoding)
    if isinstance(value, dict):
        return dict(
            (deep_encode(name, encoding = encoding), deep_encode(item, encoding = encoding))
            for name, item in value.items()
            )
    if isinstance(value, list):
        return [deep_encode(item, encoding = encoding) for item in value]
    if isinstance(value, tuple):
        return tuple(deep_encode(item, encoding = encoding) for item in value)
    return value
def lower(s):
    """Convert a string to lower case.

    .. note:: This method is equivalent to the ``lower()`` method of strings, but can be used when a function is
       expected, for example by the :func:`normalize` & :func:`slugify` functions.

    ``None`` is passed through unchanged.

    >>> lower('Hello world!')
    'hello world!'
    >>> lower(u'Hello world!') == u'hello world!'
    True
    >>> lower(None) is None
    True
    """
    return None if s is None else s.lower()
def normalize(s, encoding = 'utf-8', separator = u' ', transform = lower):
    """Convert a string to its normal form using compatibility decomposition and removing combining characters.

    A byte string is first decoded using ``encoding``. Leading and trailing whitespace is removed and every
    inner run of whitespace is replaced by ``separator``. When ``transform`` is not ``None``, it is called
    with the cleaned string and its result is returned. ``None`` is passed through unchanged.

    >>> normalize(u'Hello world!') == u'hello world!'
    True
    >>> normalize(u'   Hello   world!   ') == u'hello world!'
    True
    >>> normalize(u'œil, forêt, ça, où...') == u'œil, foret, ca, ou...'
    True
    >>> normalize(b'Hello world!') == u'hello world!'
    True
    >>> normalize(u'   ') == u''
    True
    >>> normalize(None) is None
    True
    """
    if s is None:
        return None
    # ``bytes`` is the byte string type on Python 2 (alias of ``str``) and on Python 3.
    if isinstance(s, bytes):
        s = s.decode(encoding)
    # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3.
    assert isinstance(s, type(u'')), str((s,))
    # NFKD splits accented characters into a base character followed by combining marks; dropping the
    # marks (non-zero combining class) keeps only the base characters.
    normalized = u''.join(c for c in unicodedata.normalize('NFKD', s) if unicodedata.combining(c) == 0)
    # split() without argument already ignores leading and trailing whitespace.
    normalized = separator.join(normalized.split())
    if transform is not None:
        normalized = transform(normalized)
    return normalized
def slugify(s, encoding = 'utf-8', separator = u'-', transform = lower):
    """Simplify a string, converting it to a lowercase ASCII subset.

    A byte string is first decoded using ``encoding``. Each character is converted by
    :func:`slugify_char`, the resulting words are joined with ``separator`` and, when ``transform`` is not
    ``None``, the result of ``transform`` applied to that string is returned. ``None`` is passed through
    unchanged.

    >>> slugify(u'Hello world!') == u'hello-world'
    True
    >>> slugify(u'   Hello   world!   ') == u'hello-world'
    True
    >>> slugify(u'œil, forêt, ça, où...') == u'oeil-foret-ca-ou'
    True
    >>> slugify(b'Hello world!') == u'hello-world'
    True
    >>> slugify(None) is None
    True
    """
    if s is None:
        return None
    # ``bytes`` is the byte string type on Python 2 (alias of ``str``) and on Python 3.
    if isinstance(s, bytes):
        s = s.decode(encoding)
    # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3.
    assert isinstance(s, type(u'')), str((s,))
    simplified = u''.join(slugify_char(unicode_char) for unicode_char in s)
    # Collapse every run of spaces into a single one and drop leading and trailing spaces. A
    # search-and-replace loop whose needle equals its replacement would never terminate, so the string is
    # split and joined instead.
    simplified = u' '.join(simplified.split())
    if separator != u' ':
        simplified = simplified.replace(u' ', separator)
    if transform is not None:
        simplified = transform(simplified)
    return simplified
def slugify_char(unicode_char):
    """Convert an unicode character to a subset of uppercase ASCII characters or an empty string.

    The character is first converted by :func:`unicode_char_to_ascii`, then upper-cased. Every resulting
    character that is neither a space, a digit nor an uppercase ASCII letter is replaced by a space.

    The result can be composed of several characters (for example, 'œ' becomes 'OE').
    """
    ascii_chars = unicode_char_to_ascii(unicode_char)
    if not ascii_chars:
        # Nothing to filter: hand back the empty conversion as is.
        return ascii_chars
    allowed_chars = ' 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    return ''.join(
        ascii_char if ascii_char in allowed_chars else ' '
        for ascii_char in ascii_chars.upper()
        )
def unicode_char_to_ascii(unicode_char):
    """Convert an unicode character to several ASCII characters

    Characters listed in ``ASCII_TRANSLATIONS`` are replaced by their table entry. Other characters are
    returned unchanged when they are plain ASCII (code point below 0x80) and replaced by an empty string
    otherwise.
    """
    translation = ASCII_TRANSLATIONS.get(unicode_char)
    if translation is not None:
        return translation
    return str(unicode_char) if ord(unicode_char) < 0x80 else ''
def upper(s):
    """Convert a string to upper case.

    .. note:: This method is equivalent to the ``upper()`` method of strings, but can be used when a function is
       expected, for example by the :func:`normalize` & :func:`slugify` functions.

    ``None`` is passed through unchanged.

    >>> upper('Hello world!')
    'HELLO WORLD!'
    >>> upper(u'Hello world!') == u'HELLO WORLD!'
    True
    >>> upper(None) is None
    True
    """
    return None if s is None else s.upper()