﻿id	summary	reporter	owner	description	type	status	component	version	severity	resolution	keywords	cc	stage	has_patch	needs_docs	needs_tests	needs_better_patch	easy	ui_ux
2543	[patch] Django doesn't handle UTF-8 encoded URLs properly	Victor Ng <crankycoder@…>	hugo	"RFC 2718 and RFC 3986 specify that URLs containing non-English characters should be encoded in UTF-8 and that unsafe characters should be encoded in %HH format.

Django should decode paths assuming that they are UTF-8 data and turn them into unicode strings.

Here's a patch that decodes paths into unicode data and enables the re.UNICODE flag on regular expression matching in the urlpatterns; it also includes a single-line addition to the unit test suite.

{{{
Index: django/core/urlresolvers.py
===================================================================
--- django/core/urlresolvers.py	(revision 3582)
+++ django/core/urlresolvers.py	(working copy)
@@ -79,9 +79,10 @@
             test_regex = grouped
         # Note we're using re.match here on purpose because the start of
         # to string needs to match.
-        if not re.match(test_regex + '$', str(value)): # TODO: Unicode?
+        match_objs = re.match(""("" + test_regex + ')$', unicode(value), re.UNICODE)
+        if not match_objs:
             raise NoReverseMatch(""Value %r didn't match regular expression %r"" % (value, test_regex))
-        return str(value) # TODO: Unicode?
+        return match_objs.group(0)
 
 class RegexURLPattern(object):
     def __init__(self, regex, callback, default_args=None):
@@ -89,7 +90,7 @@
         # callback is either a string like 'foo.views.news.stories.story_detail'
         # which represents the path to a module and a view function name, or a
         # callable object (view).
-        self.regex = re.compile(regex)
+        self.regex = re.compile(regex, re.UNICODE)
         if callable(callback):
             self._callback = callback
         else:
@@ -143,7 +144,7 @@
     def __init__(self, regex, urlconf_name, default_kwargs=None):
         # regex is a string representing a regular expression.
         # urlconf_name is a string representing the module containing urlconfs.
-        self.regex = re.compile(regex)
+        self.regex = re.compile(regex, re.UNICODE)
         self.urlconf_name = urlconf_name
         self.callback = None
         self.default_kwargs = default_kwargs or {}
@@ -225,6 +226,7 @@
         from django.conf import settings
         urlconf = settings.ROOT_URLCONF
     resolver = RegexURLResolver(r'^/', urlconf)
+
     return resolver.resolve(path)
 
 def reverse(viewname, urlconf=None, args=None, kwargs=None):
Index: django/core/handlers/base.py
===================================================================
--- django/core/handlers/base.py	(revision 3582)
+++ django/core/handlers/base.py	(working copy)
@@ -60,6 +60,9 @@
             if response:
                 return response
 
+        # URLs are encoded in UTF8 (RFC3986, RFC2718)
+        path = path.decode('utf8')
+
         resolver = urlresolvers.RegexURLResolver(r'^/', settings.ROOT_URLCONF)
         try:
             callback, callback_args, callback_kwargs = resolver.resolve(path)
Index: tests/othertests/urlpatterns_reverse.py
===================================================================
--- tests/othertests/urlpatterns_reverse.py	(revision 3582)
+++ tests/othertests/urlpatterns_reverse.py	(working copy)
@@ -23,6 +23,8 @@
     ('^people/(?P<state>\w\w)/(?P<name>\w+)/$', NoReverseMatch, [], {'name': 'adrian'}),
     ('^people/(?P<state>\w\w)/(\w+)/$', NoReverseMatch, ['il'], {'name': 'adrian'}),
     ('^people/(?P<state>\w\w)/(\w+)/$', 'people/il/adrian/', ['adrian'], {'state': 'il'}),
+    ('^people/(?P<state>\w\w)/(\w+)/$', 'people/il/adrian/', ['adrian'], {'state': 'il'}),
+    ('^people/(?P<state>\w\w)/(?P<name>\w+)/$', u'people/il/adria\xd0/', [], {'state': u'il', 'name': u'adria\xd0'}),
 )
 
 def run_tests(verbosity=0):

}}}"	defect	closed	Internationalization		normal	duplicate		victor.ng@…	Unreviewed	1	0	0	0	0	0
