Django

Code

Ticket #7027: fix_tag_translation-refactored-re_7027.diff

File fix_tag_translation-refactored-re_7027.diff, 4.5 kB (added by mrts, 4 months ago)

Refactored the regex and wrote an explanation

  • django/utils/text.py

    old new  
    197197    return str(ustring_re.sub(fix, s)) 
    198198javascript_quote = allow_lazy(javascript_quote, unicode) 
    199199 
    200 smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)') 
     200# There are three match blocks (A|B|C). A and B have identical structure, 
     201# differing only by the matched quotation marks. These blocks catch anything 
     202# in quotes as a single match group, optionally surrounded by `_()`. 
     203# C is the uninteresting, trivial case -- it matches any non-empty sequence 
     204# of non-space characters. 
     205
     206# Block A should be read as follows, where `` mark string literals and 
     207# NG stands for "non-grouping". 
     208
     209# Block B should be read the same way, substituting `'` for `"`. 
     210
     211# Reading of block A: 
     212
     213# (?:_\()? -- NG optional match of the ugettext start marker `_(` at the beginning; 
     214#             NG is needed because `_(` is a digraph 
     215
     216# `"` -- match string start marker 
     217
     218# (?:[^"\\]|\\.)+ -- NG match for a non-empty sequence of either 
     219#                    * any character except `"` or `\`, 
     220#                    * or digraph `\` followed by any single character 
     221
     222# `"` -- match string end marker 
     223
     224# \)? -- match the optional ugettext end marker `)` at the end 
     225
     226# Note that this lets through both `" ")` and `_(" "`; avoiding these 
     227# would make the regex needlessly complex. 
     228smart_split_re = re.compile( 
     229        r'((?:_\()?"(?:[^"\\]|\\.)+"\)?' # block A, for "foo bar" 
     230        r"|(?:_\()?'(?:[^'\\]|\\.)+'\)?" # block B, for 'foo bar' 
     231        r"|[^\s]+)") # block C, anything else without whitespace 
     232
    201233def smart_split(text): 
    202234    r""" 
    203235    Generator that splits a string by spaces, leaving quoted phrases together. 
    204236    Supports both single and double quotes, and supports escaping quotes with 
    205237    backslashes. In the output, strings will keep their initial and trailing 
    206     quote marks. 
     238    quote marks. Also, gettext markers '_(', ')' are preserved. 
    207239 
    208240    >>> list(smart_split(r'This is "a person\'s" test.')) 
    209241    [u'This', u'is', u'"a person\\\'s"', u'test.'] 
    210         >>> list(smart_split(r"Another 'person\'s' test."))  
     242        >>> list(smart_split(r"Another 'person\'s' test.")) 
    211243        [u'Another', u"'person's'", u'test.'] 
    212         >>> list(smart_split(r'A "\"funky\" style" test.'))  
     244        >>> list(smart_split(r'A "\"funky\" style" test.')) 
    213245        [u'A', u'""funky" style"', u'test.'] 
     246    >>> list(smart_split(' _("my quoted string") ')) 
     247    [u'_("my quoted string")'] 
     248    >>> list(smart_split(" _('my quoted string') ")) 
     249    [u"_('my quoted string')"] 
    214250    """ 
    215251    text = force_unicode(text) 
    216252    for bit in smart_split_re.finditer(text): 
    217253        bit = bit.group(0) 
    218         if bit[0] == '"' and bit[-1] == '"': 
    219             yield '"' + bit[1:-1].replace('\\"', '"').replace('\\\\', '\\') + '"' 
    220         elif bit[0] == "'" and bit[-1] == "'": 
    221             yield "'" + bit[1:-1].replace("\\'", "'").replace("\\\\", "\\") + "'" 
     254        prefix, suffix = '', '' 
     255        start, end = 1, -1 
     256        if bit[0:2] == '_(' and bit[-1] == ')': 
     257            prefix, suffix = '_(', ')' 
     258            start, end = 3, -2 
     259        if (bit[0] == '"' and bit[-1] == '"' 
     260                or bit[0:3] == '_("' and bit[-2:] == '")'): 
     261            yield '%s"%s"%s' % (prefix, 
     262                    bit[start:end].replace(r'\"', '"').replace(r'\\', '\\'), 
     263                    suffix) 
     264        elif (bit[0] == "'" and bit[-1] == "'" 
     265                or bit[0:3] == "_('" and bit[-2:] == "')"): 
     266            yield "%s'%s'%s" % (prefix, 
     267                    bit[start:end].replace(r"\'", "'").replace(r'\\', '\\'), 
     268                    suffix) 
    222269        else: 
    223270            yield bit 
    224271smart_split = allow_lazy(smart_split, unicode) 
  • tests/regressiontests/text/tests.py

    old new  
    1515[u'"a', u"'one"] 
    1616>>> print list(smart_split(r'''all friends' tests'''))[1] 
    1717friends' 
     18>>> list(smart_split(' _("my quoted string") ')) 
     19[u'_("my quoted string")'] 
     20>>> list(smart_split(" _('my quoted string') ")) 
     21[u"_('my quoted string')"] 
     22>>> print list(smart_split(" _('my \"quoted\" string') "))[0] 
     23_('my "quoted" string') 
    1824 
    1925### urlquote ############################################################# 
    2026>>> from django.utils.http import urlquote, urlquote_plus