Ticket #7027: fix_tag_translation-refactored-re_7027.diff

File fix_tag_translation-refactored-re_7027.diff, 4.5 KB (added by mrts, 16 years ago)

Refactored the regex and wrote an explanation

  • django/utils/text.py

     
    197197    return str(ustring_re.sub(fix, s))
    198198javascript_quote = allow_lazy(javascript_quote, unicode)
    199199
    200 smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
     200# There are three match blocks (A|B|C). A and B have identical structure,
     201# differing only by the matched quotation marks. These blocks catch anything
     202# in quotes as a single match group, optionally surrounded by `_()`.
     203# C is the uninteresting, trivial case -- it matches any non-empty sequence
     204# of non-space characters.
     205#
     206# Block A should be read as follows, where backticks (``) delimit string
     207# literals and NG stands for "non-grouping".
     208#
     209# Block B should be read the same way, with `'` substituted for `"`.
     210#
     211# Reading of block A:
     212#
     213# (?:_\()? -- NG optional match of the ugettext start marker `_(` at the
     214#             beginning; the NG group is needed because `_(` is a digraph
     215#
     216# `"` -- match string start marker
     217#
     218# (?:[^"\\]|\\.)+ -- NG match for a non-empty sequence of either
     219#                    * any character except `"` or `\`,
     220#                    * or digraph `\` followed by any single character
     221#
     222# `"` -- match string end marker
     223#
     224# \)? -- optional match of the ugettext end marker `)` at the end
     225#
     226# Note that this lets through both `" ")` and `_(" "`; avoiding these
     227# would make the regex needlessly complex.
     228smart_split_re = re.compile(
     229        r'((?:_\()?"(?:[^"\\]|\\.)+"\)?' # block A, for "foo bar"
     230        r"|(?:_\()?'(?:[^'\\]|\\.)+'\)?" # block B, for 'foo bar'
     231        r"|[^\s]+)" # block C, anything other without whitespace
     232)
    201233def smart_split(text):
    202234    r"""
    203235    Generator that splits a string by spaces, leaving quoted phrases together.
    204236    Supports both single and double quotes, and supports escaping quotes with
    205237    backslashes. In the output, strings will keep their initial and trailing
    206     quote marks.
     238    quote marks. Also, gettext markers '_(', ')' are preserved.
    207239
    208240    >>> list(smart_split(r'This is "a person\'s" test.'))
    209241    [u'This', u'is', u'"a person\\\'s"', u'test.']
    210         >>> list(smart_split(r"Another 'person\'s' test.")) 
     242        >>> list(smart_split(r"Another 'person\'s' test."))
    211243        [u'Another', u"'person's'", u'test.']
    212         >>> list(smart_split(r'A "\"funky\" style" test.')) 
     244        >>> list(smart_split(r'A "\"funky\" style" test.'))
    213245        [u'A', u'""funky" style"', u'test.']
     246    >>> list(smart_split(' _("my quoted string") '))
     247    [u'_("my quoted string")']
     248    >>> list(smart_split(" _('my quoted string') "))
     249    [u"_('my quoted string')"]
    214250    """
    215251    text = force_unicode(text)
    216252    for bit in smart_split_re.finditer(text):
    217253        bit = bit.group(0)
    218         if bit[0] == '"' and bit[-1] == '"':
    219             yield '"' + bit[1:-1].replace('\\"', '"').replace('\\\\', '\\') + '"'
    220         elif bit[0] == "'" and bit[-1] == "'":
    221             yield "'" + bit[1:-1].replace("\\'", "'").replace("\\\\", "\\") + "'"
     254        prefix, suffix = '', ''
     255        start, end = 1, -1
     256        if bit[0:2] == '_(' and bit[-1] == ')':
     257            prefix, suffix = '_(', ')'
     258            start, end = 3, -2
     259        if (bit[0] == '"' and bit[-1] == '"'
     260                or bit[0:3] == '_("' and bit[-2:] == '")'):
     261            yield '%s"%s"%s' % (prefix,
     262                    bit[start:end].replace(r'\"', '"').replace(r'\\', '\\'),
     263                    suffix)
     264        elif (bit[0] == "'" and bit[-1] == "'"
     265                or bit[0:3] == "_('" and bit[-2:] == "')"):
     266            yield "%s'%s'%s" % (prefix,
     267                    bit[start:end].replace(r"\'", "'").replace(r'\\', '\\'),
     268                    suffix)
    222269        else:
    223270            yield bit
    224271smart_split = allow_lazy(smart_split, unicode)
  • tests/regressiontests/text/tests.py

     
    1515[u'"a', u"'one"]
    1616>>> print list(smart_split(r'''all friends' tests'''))[1]
    1717friends'
     18>>> list(smart_split(' _("my quoted string") '))
     19[u'_("my quoted string")']
     20>>> list(smart_split(" _('my quoted string') "))
     21[u"_('my quoted string')"]
     22>>> print list(smart_split(" _('my \"quoted\" string') "))[0]
     23_('my "quoted" string')
    1824
    1925### urlquote #############################################################
    2026>>> from django.utils.http import urlquote, urlquote_plus
Back to Top