Source code for textseg

#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""
The pytextseg package provides functions to wrap plain texts:
:func:`fill` and :func:`wrap` are Unicode-aware alternatives to those
of the standard :mod:`textwrap` module; :func:`fold` and :func:`unfold`
are functions mainly intended for plain text messages such as e-mail.

It also provides lower level interfaces for text segmentation:
:class:`LineBreak` class for line breaking; :class:`GCStr` class for
grapheme cluster segmentation.
"""
# Copyright (C) 2012 by Hatuka*nezumi - IKEDA Soji.
#
# This file is part of the pytextseg package.  This program is free
# software; you can redistribute it and/or modify it under the terms of
# either the GNU General Public License or the Artistic License, as
# specified in the README file.

__all__ = ['Consts', 'GCStr', 'LineBreak', 'LineBreakException',
           'fill', 'fold', 'unfold', 'wrap']

import re
import _textseg
from textseg.Consts import lbcBK, lbcCR, lbcLF, lbcNL, lbcSP, eawN
try:
    from email.charset import Charset
except ImportError:
    from email.Charset import Charset

try:
    unicode, unichr
except NameError:
    unicode = str
    unichr = chr

###
### Function wrap()
###

def wrap(text, width=70, initial_indent="", subsequent_indent="",
         expand_tabs=True, replace_whitespace=True,
         fix_sentence_endings=False, break_long_words=True,
         break_on_hyphens=True, drop_whitespace=True, **kwds):
    """\
    wrap(text[, options...]) -> [unicode]

    Wrap paragraphs of a text then return a list of wrapped lines.

    Reformat each paragraph in *text* so that it fits in lines of no more
    than *width* :term:`columns<number of columns>` if possible, and return
    a list of wrapped lines.  By default, tabs in *text* are expanded and
    all other whitespace characters (including newline) are converted to
    space.

    See :mod:`textwrap` about options.

    .. note::
       Some options take no effect in this module:
       *fix_sentence_endings*, *break_on_hyphens*, *drop_whitespace*.

    Any other named arguments are passed to the :class:`LineBreak`
    constructor; see the instance attributes of that class.  Values given
    explicitly by the caller take precedence over the defaults this
    function supplies (*charmax*, *format*, *newline*, *urgent*, *width*).
    """
    def indent_format(self, action, s):
        # Format callback given to LineBreak.  The meaning of the action
        # codes ('sot', 'sop', 'sol', 'eo...') is defined by LineBreak;
        # presumably start of text/paragraph/line and end of something --
        # TODO confirm against the LineBreak documentation.
        if action.startswith('eo'):
            return self.newline
        if action in ('sot', 'sop'):
            # ``s * 0 + ...`` starts from an empty value derived from *s*,
            # presumably so the result keeps the type of *s*.
            return s * 0 + initial_indent + s
        if action == 'sol':
            return s * 0 + subsequent_indent + s
        if action == '' and (s == initial_indent or s == subsequent_indent):
            # A line consisting only of the indent is emitted as empty.
            return ''
        return None

    if expand_tabs:
        text = GCStr(text).expandtabs()
    if replace_whitespace:
        # translate() looks characters up by their *ordinal*, so the table
        # must be keyed by ord(c).  Keying it by the character itself (as
        # the previous code did) made the translation a silent no-op.
        blank = unicode(' ')
        table = dict([(ord(c), blank) for c in unicode('\t\n\x0b\x0c\r ')])
        if isinstance(text, GCStr):
            text = text * 0 + unicode(text).translate(table)
        else:
            text = text.translate(table)

    defaults = (
        ('charmax', 0),
        ('format', indent_format),
        ('newline', ''),
        ('urgent', (break_long_words and "FORCE" or None)),
        ('width', width),
    )
    for key, value in defaults:
        kwds.setdefault(key, value)
    lb = LineBreak(**kwds)
    return [unicode(s) for s in lb.wrap(text)]


###
### Function fill()
###
def fill(text, **kwds):
    """\
    fill(text[, options...]) -> unicode

    Reformat the single paragraph in *text* to fit in lines of no more
    than *width* :term:`columns<number of columns>`, and return a new
    string containing the entire wrapped paragraph.

    Optional named arguments are passed through unchanged to the
    :func:`wrap<textseg.wrap>` function; the wrapped lines it returns are
    joined with a newline character.
    """
    wrapped_lines = wrap(text, **kwds)
    separator = unicode("\n")
    return separator.join(wrapped_lines)


###
### Function fold()
###
def _fold_FIXED(self, action, s):
    """Format callback used by :func:`fold` for the ``"fixed"`` method.

    *self* is the object the callback is invoked on (fold() passes this
    function as the *format* argument of LineBreak); per-text state is
    kept under ``self['_']``.  Returns a replacement string, or None.
    """
    if action == 'eol':
        return self.newline
    if action == '':
        # Remember the line most recently seen for the end-of-... handling.
        self['_']['line'] = s
        return None
    if action in ('sot', 'sop'):
        # Save the configured width; a segment starting with ">" gets
        # width 0 for its duration.
        self['_'] = {'width': self.width}
        if s.startswith('>'):
            self.width = 0
        return None
    if action.startswith('eo'):
        state = self['_']
        if len(state['line']) and self.width:
            ending = self.newline + self.newline
        else:
            ending = self.newline
        # Restore the width saved at the start and drop the state.
        self.width = state['width']
        del self['_']
        return ending
    return None


# Run of ">" characters at the very beginning of a line.
_prefixRe = re.compile(r'^>+')


def _fold_FLOWED(self, action, s):
    """Format callback used by :func:`fold` for the ``"flowed"`` method.

    Keeps the ">" prefix of the current segment and the last line seen
    under ``self['_']``.  Returns a replacement string, or None.
    """
    if action == 'sol':
        quote = self['_']['prefix']
        if len(quote):
            return quote + ' ' + s
        if s.startswith(' ') or s.startswith('From ') or s.startswith('>'):
            # Prepend a space to lines beginning with " ", "From " or ">".
            return ' ' + s
        return None
    if action.startswith('so'):
        found = _prefixRe.match(unicode(s))
        if found:
            self['_'] = {'prefix': found.group()}
            return None
        self['_'] = {'prefix': ''}
        if s.startswith(' ') or s.startswith('From '):
            return ' ' + s
        return None
    if action == '':
        self['_']['line'] = s
        return None
    if action == 'eol':
        if len(s):
            s = ' '
        return s + ' ' + self.newline
    if action.startswith('eo'):
        state = self['_']
        if len(state['line']) and not len(state['prefix']):
            ending = ' ' + self.newline + self.newline
        else:
            ending = self.newline
        del self['_']
        return ending
    return None


def _fold_PLAIN(self, action, s):
    """Format callback used by :func:`fold` for the ``"plain"`` method.

    Returns ``self.newline`` for every action beginning with ``'eo'`` and
    None for anything else.
    """
    if not action.startswith('eo'):
        return None
    return self.newline


# Folding method name -> format callback.
_fold_funcs = {
    'plain': _fold_PLAIN,
    'fixed': _fold_FIXED,
    'flowed': _fold_FLOWED,
}

# Matches (and captures) a single U+000B, U+000C, U+0085, U+2028 or U+2029.
_specialBreakRe = re.compile(
    '([' +
    ''.join([unichr(cp) for cp in (0xB, 0xC, 0x85, 0x2028, 0x2029)]) +
    '])',
    re.S)
def fold(string, method='plain', tabsize=8, charset=None,
         language=None, **kwds):
    """\
    fold(string[, method, options...]) -> unicode

    Fold lines of string *string* to fit in lines of no more than *width*
    columns, and return it.

    The following values may be given for the *method* argument.

    ``"fixed"``
        Lines preceded by ">" won't be folded.
        Paragraphs are separated by an empty line.
    ``"flowed"``
        "Format=Flowed; DelSp=Yes" formatting defined by :rfc:`3676`.
    ``"plain"``
        Default method.  All lines are folded.

    An unrecognized *method* is treated as ``"plain"``.

    Surplus SPACEs and horizontal tabs at end of line are removed,
    newline sequences are replaced by that specified by the optional
    *newline* argument, and a newline is appended at the end of the text
    if it does not exist.
    Horizontal tabs are treated as tab stops according to the *tabsize*
    argument.

    *charset* or *language* is used to determine the language/region
    context: East Asian or not.  *charset* takes precedence when both are
    given.

    For other named arguments see instance attributes of the
    :class:`LineBreak` class.  An empty or None *string* yields ``''``.
    """
    def sizing(self, cols, pre, spc, s):
        # Sizing callback: walk the leading run of SP-class clusters,
        # advancing tabs to the next tab stop; once a non-space cluster is
        # reached, the remainder is measured in one step.
        padded = spc + s
        for idx, gc in enumerate(padded):
            if gc.lbc != lbcSP:
                return cols + padded[idx:].cols
            if gc == "\t":
                if 0 < tabsize:
                    cols += tabsize - (cols % tabsize)
            else:
                cols += gc.cols
        return cols

    if string is None or not len(string):
        return ''
    if not isinstance(string, unicode):
        string = unicode(string)

    # Determine East Asian context from charset, else language, else leave
    # it unspecified (None) so that LineBreak's own default applies.
    eastasian_context = None
    if charset is not None:
        charset = Charset(charset).get_output_charset()
        eastasian_context = re.match(
            r'''big5|cp9\d\d|euc-|gb18030|gb2312|gbk|hz| iso-2022-|ks_c_5601|shift_jis''',
            charset, re.I + re.X) is not None
    elif language is not None:
        eastasian_context = re.match(
            r'ain|ja\b|jpn|ko\b|kor|zh\b|chi', language, re.I) is not None

    kwds['format'] = _fold_funcs.get(method.lower(), _fold_PLAIN)
    kwds['sizing'] = sizing
    if eastasian_context is not None:
        kwds['eastasian_context'] = eastasian_context
    lb = LineBreak(**kwds)
    # Treat horizontal tab as a space for line breaking purposes.
    lb.lbc["\t"] = lbcSP

    # The special break characters are passed through untouched; only the
    # text between them is folded.
    pieces = []
    for segment in _specialBreakRe.split(string):
        if not len(segment):
            continue
        if _specialBreakRe.match(segment):
            pieces.append(segment)
        else:
            pieces.append(''.join([unicode(l) for l in lb.wrap(segment)]))
    return ''.join(pieces)


###
### function unfold()
###
def unfold(string, method='fixed', newline="\n", **kwds):
    """\
    unfold(string[, method, newline]) -> unicode

    Join the folded paragraphs of *string* and return the result.

    The following values may be given for the *method* argument.

    ``"fixed"``
        Default method.
        Lines preceded by ``">"`` won't be joined.
        An empty line is treated as a paragraph separator.
    ``"flowed"``
        Unfold "Format=Flowed; DelSp=Yes" formatting defined by
        :rfc:`3676`.
    ``"flowedsp"``
        Unfold "Format=Flowed; DelSp=No" formatting defined by
        :rfc:`3676`.

    An unrecognized *method* is treated as ``"fixed"``.  CR LF and CR in
    the input are normalized to LF first; *newline* is the sequence
    written for each line end kept in the output.  Remaining named
    arguments are passed to the :class:`LineBreak` constructor (used by
    the ``"fixed"`` method only).
    """
    if not isinstance(string, unicode):
        string = unicode(string)
    if not len(string):
        return string
    string = re.sub(r'\r\n|\r', "\n", string)

    method = method.lower()
    if method not in ('fixed', 'flowed', 'flowedsp'):
        method = 'fixed'
    delsp = (method == 'flowed')

    def unfold_fixed(seg):
        # Unfold one segment with the "fixed" rules.  The patterns are
        # tried in this exact order at each position.
        blank_re = re.compile(r'\n')
        hard_res = (re.compile(r'(.+)\n\n'),     # line ending a paragraph
                    re.compile(r'(>.*)\n'),      # line starting with ">"
                    re.compile(r'(.+)\n(?=>)'))  # line before a ">" line
        soft_re = re.compile(r'(.+?)( *)\n(?=(.+))')
        last_nl_re = re.compile(r'(.+)\n')
        rest_re = re.compile(r'(.+)')

        lb = LineBreak(**kwds)
        out = ''
        pos = 0
        seg_len = len(seg)
        while pos < seg_len:
            m = blank_re.match(seg, pos)
            if m:
                out += newline
                pos = m.end()
                continue
            m = (hard_res[0].match(seg, pos) or hard_res[1].match(seg, pos)
                 or hard_res[2].match(seg, pos))
            if m:
                out += m.group(1) + newline
                pos = m.end()
                continue
            m = soft_re.match(seg, pos)
            if m:
                body, trailing, following = m.group(1, 2, 3)
                out += body
                if following.startswith(' '):
                    out += newline
                elif len(trailing):
                    out += trailing
                elif len(body):
                    # Insert a space only where the line breaking rules
                    # report an indirect break between the two lines.
                    if lb.breakingRule(body, following) == LineBreak.INDIRECT:
                        out += ' '
                pos = m.end()
                continue
            m = last_nl_re.match(seg, pos)
            if m:
                out += m.group(1) + newline
                pos = m.end()
                continue
            # Final line without a trailing newline.
            out += rest_re.match(seg, pos).group(1) + newline
            break
        return out

    def unfold_flowed(seg):
        # Unfold one segment with the "flowed"/"flowedsp" rules.
        quoted_re = re.compile(r'(>+) ?(.*?)( ?)\n')
        plain_re = re.compile(r' ?(.*?)( ?)\n')
        rest_re = re.compile(r' ?(.*)')

        out = ''
        # None: not inside a paragraph; '': inside an unquoted paragraph;
        # otherwise the ">" run of the paragraph being continued.
        prefix = None
        pos = 0
        seg_len = len(seg)
        while pos < seg_len:
            m = quoted_re.match(seg, pos)
            if m:
                depth, body, soft = m.group(1, 2, 3)
                if prefix is None:
                    out += depth + ' ' + body
                elif depth != prefix:
                    out += newline + depth + ' ' + body
                else:
                    out += body
                if len(soft):
                    # Trailing space: the paragraph continues.
                    prefix = depth
                    if not delsp:
                        out += soft
                else:
                    out += newline
                    prefix = None
                pos = m.end()
                continue
            m = plain_re.match(seg, pos)
            if m:
                body, soft = m.group(1, 2)
                if prefix is None or prefix == '':
                    out += body
                else:
                    out += newline + body
                if len(soft):
                    if not delsp:
                        out += soft
                    prefix = ''
                else:
                    out += newline
                    prefix = None
                pos = m.end()
                continue
            # Final line without a trailing newline.
            out += rest_re.match(seg, pos).group(1) + newline
            break
        return out

    ## Do unfolding.
    result = ''
    for segment in _specialBreakRe.split(string):
        if segment == '':
            continue
        if _specialBreakRe.match(segment):
            # Special break characters are copied through as they are.
            result += segment
        elif method == 'fixed':
            result += unfold_fixed(segment)
        else:
            result += unfold_flowed(segment)
    return result


###
### Exception
###

# On Python 2.4 or earlier, exception is classical class.  So derive the
# base object class to make sure that this exception is new style class.
class LineBreakException(_textseg.LineBreakException, object):
    """\
    Exception type of this package, derived from the exception of the
    underlying :mod:`_textseg` module and from :class:`object`.

    See :attr:`urgent<textseg.LineBreak.urgent>` attribute of
    :class:`LineBreak` class.
    """


###
### Class for Line breaking
###
class LineBreak(_textseg.LineBreak):
    """\
    LineBreak class performs the Line Breaking Algorithm described in
    Unicode Standard Annex #14 ([UAX14]_).
    :term:`East_Asian_Width` informative properties defined by Annex #11
    ([UAX11]_) are taken into account to determine breaking positions.
    """

    # Kinds of line breaking behavior (compared against the result of
    # ``breakingRule()`` elsewhere in this module).
    MANDATORY = 4
    DIRECT = 3
    INDIRECT = 2
    PROHIBITED = 1

    #: Dictionary containing default values of instance attributes.
    DEFAULTS = {
        'width': 70,
        'minwidth': 0,
        'charmax': 998,
        'eastasian_context': False,
        'eaw': None,
        'format': 'SIMPLE',
        'hangul_as_al': False,
        'lbc': None,
        'legacy_cm': True,
        'newline': "\n",
        'prep': None,
        'sizing': "UAX11",
        'urgent': None,
        'virama_as_joiner': True,
    }

    def __init__(self, **kwds):
        """\
        LineBreak([options...]) -> LineBreak

        Create new LineBreak object.
        Optional named arguments may specify initial attribute values.
        See the documentation of the instance attributes.

        Any attribute not given explicitly takes its initial value from
        :attr:`DEFAULTS`.  Unless overridden by the caller, the base
        class is additionally told to use the type of this object as
        *linebreakType*, :class:`GCStr` as *gcstrType* and
        :class:`LineBreakException` as *exceptionType*.
        """
        # Explicit arguments win; fill in only what is missing.
        for name in self.DEFAULTS:
            kwds.setdefault(name, self.DEFAULTS[name])
        for name, value in (('linebreakType', type(self)),
                            ('gcstrType', GCStr),
                            ('exceptionType', LineBreakException)):
            kwds.setdefault(name, value)
        _textseg.LineBreak.__init__(self, **kwds)


###
### Class for Grapheme cluster string
###
class GCStr(_textseg.GCStr):
    """\
    GCStr class treats Unicode string as a sequence of
    :term:`extended grapheme clusters<grapheme cluster>` defined by
    Unicode Standard Annex #29 ([UAX29]_).
    """

    PROHIBIT_BEFORE = 1
    ALLOW_BEFORE = 2

    def __new__(cls, string, lb=None):
        """\
        GCStr(string[, lb]) -> GCStr

        Create new grapheme cluster string (GCStr object) from Unicode
        string *string*.  If *lb* is omitted, a :class:`LineBreak` object
        with default settings is created and used.

        Optional LineBreak object *lb* controls breaking features.
        Following attributes of LineBreak object affect new GCStr object.

        - :attr:`eastasian_context<LineBreak.eastasian_context>`
        - :attr:`eaw<LineBreak.eaw>`
        - :attr:`lbc<LineBreak.lbc>`
        - :attr:`legacy_cm<LineBreak.legacy_cm>`
        - :attr:`virama_as_joiner<LineBreak.virama_as_joiner>`
        """
        if lb is None:
            lb = LineBreak()
        return _textseg.GCStr.__new__(cls, string, lb=lb)

    def center(self, width, fillchar=' '):
        """\
        S.center(width[, fillchar]) -> GCStr

        Return S centered in a string of *width*
        :term:`columns<number of columns>`.  Padding is done using the
        specified fill character (default is a space).  S itself is
        returned when there is no room for at least one fill character.
        When the padding cannot be split evenly, the extra fill character
        goes to the left.
        """
        # ``self * 0 + x`` converts x starting from an empty value derived
        # from self, so that ``.cols`` is available on the fill character.
        fillchar = self * 0 + fillchar
        if width < self.cols + fillchar.cols:
            return self
        marg = (width - self.cols) // fillchar.cols
        right = marg // 2
        return fillchar * (marg - right) + self + fillchar * right

    def endswith(self, suffix, start=0, end=None):
        """\
        S.endswith(suffix[, start[, end]]) -> bool

        Return True if S ends with the specified suffix, False otherwise.
        With optional start, test S beginning at that position.
        With optional end, stop comparing S at that position.
        suffix can also be a tuple of strings to try.
        """
        if isinstance(suffix, tuple):
            pass
        elif not isinstance(suffix, unicode):
            # The converted value must replace *suffix* itself (it used to
            # be assigned to an unused name, so the conversion was lost).
            suffix = unicode(suffix)
        if end is None:
            return unicode(self[start:]).endswith(suffix)
        return unicode(self[start:end]).endswith(suffix)

    def expandtabs(self, tabsize=8):
        """\
        S.expandtabs([tabsize]) -> GCStr

        Return a copy of S where all tab characters are expanded using
        spaces.  If *tabsize* is not given, a tab size of 8 columns is
        assumed.  If *tabsize* is not positive, tab characters are
        removed.  The column position restarts at every line break.
        """
        ret = self * 0
        j = 0  # current column within the line
        for c in self:
            if c.lbc in (lbcBK, lbcCR, lbcLF, lbcNL):
                ret += c
                j = 0
            elif c == unicode('\t'):
                if 0 < tabsize:
                    incr = tabsize - (j % tabsize)
                    ret += unicode(' ') * incr
                    j += incr
            else:
                ret += c
                j += c.cols
        return ret

    def join(self, iterable):
        """\
        S.join(iterable) -> GCStr

        Return a grapheme cluster string which is the concatenation of
        the strings in the *iterable*.  The separator between elements
        is S.
        """
        ret = self * 0
        first = True
        for s in iterable:
            if first:
                ret += s
                first = False
            else:
                ret += self + s
        return ret

    def ljust(self, width, fillchar=' '):
        """\
        S.ljust(width[, fillchar]) -> GCStr

        Return S left-justified in a grapheme cluster string of *width*
        :term:`columns<number of columns>`.  Padding is done using the
        specified fill character (default is a space).  S itself is
        returned when there is no room for at least one fill character.
        """
        fillchar = self * 0 + fillchar
        if width < self.cols + fillchar.cols:
            return self
        return self + fillchar * ((width - self.cols) // fillchar.cols)

    def rjust(self, width, fillchar=' '):
        """\
        S.rjust(width[, fillchar]) -> GCStr

        Return S right-justified in a string of *width*
        :term:`columns<number of columns>`.  Padding is done using the
        specified fill character (default is a space).  S itself is
        returned when there is no room for at least one fill character.
        """
        fillchar = self * 0 + fillchar
        if width < self.cols + fillchar.cols:
            return self
        return fillchar * ((width - self.cols) // fillchar.cols) + self

    def splitlines(self, keepends=False):
        """\
        S.splitlines([keepends]) -> [GCStr]

        Return a list of the lines in S, breaking at line boundaries.
        Line breaks are not included in the resulting list unless
        *keepends* is given and true.

        .. note::
           U+001C, U+001D and U+001E are not included in linebreak
           characters.
        """
        # NOTE(review): the slice end ``eol + 1`` and the extra ``i += 1``
        # at the bottom of the loop look off by one against the contract
        # documented above if GCStr slicing follows the usual half-open
        # convention.  Slicing is implemented in the _textseg extension,
        # so the logic is left unchanged here -- confirm and fix there.
        ret = []
        str_len = len(self)
        i = 0
        j = 0
        while i < str_len:
            while i < str_len and \
                  self[i].lbc not in (lbcBK, lbcCR, lbcLF, lbcNL):
                i += 1
            eol = i
            if i < str_len:
                i += 1
                if keepends:
                    eol = i
            ret.append(self[j:eol+1])
            j = i
            i += 1
        return ret

    def startswith(self, prefix, start=0, end=None):
        """\
        S.startswith(prefix[, start[, end]]) -> bool

        Return True if S starts with the specified prefix, False
        otherwise.
        With optional start, test S beginning at that position.
        With optional end, stop comparing S at that position.
        prefix can also be a tuple of strings to try.
        """
        if isinstance(prefix, tuple):
            pass
        elif not isinstance(prefix, unicode):
            prefix = unicode(prefix)
        if end is None:
            return unicode(self[start:]).startswith(prefix)
        return unicode(self[start:end]).startswith(prefix)

    # The original source keeps the following translate() implementation
    # disabled (wrapped in a string literal); it is preserved here as a
    # comment.
    #
    # def translate(self, table):
    #     '''\
    #     S.translate(table) -> GCStr
    #
    #     Return a copy of the GCStr object S, where all characters have
    #     been mapped through the given translation table, which must be
    #     a mapping of Unicode ordinals to Unicode ordinals, strings, or
    #     None.  Unmapped characters are left untouched.  Characters
    #     mapped to None are deleted.'''
    #     return self * 0 + unicode(self).translate(table)