Code

Ticket #5253: csv_serializer-0.96.3.diff

File csv_serializer-0.96.3.diff, 12.1 KB (added by erob, 5 years ago)
Line 
1diff -r c3ff2697c472 django/core/serializers/__init__.py
2--- a/django/core/serializers/__init__.py       Mon Apr 20 15:09:25 2009 -0400
3+++ b/django/core/serializers/__init__.py       Tue Apr 21 14:20:57 2009 -0400
4@@ -23,6 +23,7 @@
5     "xml"    : "django.core.serializers.xml_serializer",
6     "python" : "django.core.serializers.python",
7     "json"   : "django.core.serializers.json",
8+    "csv"    : "django.core.serializers.csv_serializer"
9 }
10 
11 # Check for PyYaml and register the serializer if it's available.
12@@ -87,4 +88,4 @@
13         register_serializer(format, BUILTIN_SERIALIZERS[format])
14     if hasattr(settings, "SERIALIZATION_MODULES"):
15         for format in settings.SERIALIZATION_MODULES:
16-            register_serializer(format, settings.SERIALIZATION_MODULES[format])
17\ No newline at end of file
18+            register_serializer(format, settings.SERIALIZATION_MODULES[format])
19diff -r c3ff2697c472 django/core/serializers/csv_serializer.py
20--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
21+++ b/django/core/serializers/csv_serializer.py Tue Apr 21 14:20:57 2009 -0400
22@@ -0,0 +1,298 @@
23+"""
24+Serialize multiple table data to and from a single csv stream, using the
25+standard csv module.
26+
27+The format of csv is sort of standardized in rfc4180, stating that there
28+are more implementations, even incompatible ones.  It treats headers as
29+optional where column names are separated the same way as field values.
30+It leaves some important questions open,
31+ - how to handle null values as opposed to empty strings,
32+ - how to handle relations, such as foreign keys or many-to-many
33+   relations,
34+ - how to represent multiple tables in a single csv file.
35+
36+The latter issue is addressed in Creativyst's ctx format at
37+http://www.creativyst.com/Doc/Std/ctx/ctx.htm where a multiple-line
38+header is used to describe metadata.  I didn't want to use their
39+approach because it conflicts with existing csv tools (such as the
40+python csv module) for simpler cases.
41+
42+Let's start with an example what csv this module produces and
43+understands.
44+
45+news_author:registration_number,name
46+555001,Jack
47+555002,Jill
48+
49+news_article:id,authors,title,text,published
50+1,"[1, 2]",On Life And Cheese,Once upon a time...,2001-05-30
51+2,[2],,I should write this,
52+
53+Here is a summary of how values are represented.
54+ - Tables are separated by two lineterminators because it's not
55+   intrusive and gives a good visual guidance.  It's simply parsed as an
56+   empty line by csv tools, preserving the structure.  A single csv file
57+   is also easy to split by the separator using csplit for example.
58+ - Headers are mandatory, containing the column names separated by
59+   commas.
60+ - The first header field is special, it has the form '<table name>:<pk
61+   name>'.  This doesn't conflict with other parsers; and the colon as
62+   separator is widely used in the Unix world and it cannot be part of
63+   the table or column name.  The usage of <pk name> instead of just
64+   'pk' is intentional, although it differs from the constant usage of
65+   'pk' in the json and xml serializer modules -- this is how database
66+   dumps work, for example in sqlite.
67+ - None is represented as an empty string.
68+ - Foreign keys are represented as integers.
69+ - Many-to-many relations are represented as a list of foreign keys.
70+ - Strings are represented as they are except for strings that contain
71+   only zero or more spaces.
72+ - Strings of only zero or more spaces are prepended an extra leading
73+   space, so '' becomes ' ', ' ' becomes '  ', etc.  This may look
74+   strange first but this is how None (represented as '') and ''
75+   (represented as ' ') are distinguished.  Space-only strings are a
76+   rare beast, leading and trailing spaces are also frequently trimmed
77+   by csv parsers, so I find this a fair compromise.
78+"""
79+import codecs
80+import csv
81+try:
82+    from cStringIO import StringIO
83+except ImportError:
84+    from StringIO import StringIO
85+import os
86+import re
87+
88+from django.core.serializers import base
89+from django.db import models
90+# These fields should all extend CharField since they all work with
91+# string data
92+from django.db.models.fields import CharField, FilePathField, SlugField, TextField
93+
94+# FileField and USStateField are only available in Django 1.0.X
95+#from django.db.models.fields.files import FileField
96+#from django.contrib.localflavor.us.models import USStateField
97+
# Matches strings consisting solely of spaces, including the empty
# string; used to tell None (written as '') apart from space-only values.
spaces_re = re.compile(r'^ *$')
99+
class Serializer(base.Serializer):
    """
    Serialize a queryset to csv: one table per model, tables separated
    by an empty row, each table starting with a header row of the form
    '<table name>:<pk name>,field1,field2,...'.
    """

    def start_serialization(self):
        # Track the meta of the previous object so a new header row is
        # emitted whenever the model changes.
        self.last_model = None
        # By default, csv module uses '\r\n' as lineterminator
        self.output = UnicodeWriter(self.stream, lineterminator=os.linesep)

    def start_object(self, obj):
        """Emit a separator + header row when the model changes, then
        start the row for this object with its primary key value."""
        if not hasattr(obj, "_meta"):
            raise base.SerializationError("Non-model object (%s) encountered during serialization" % type(obj))
        if self.last_model != obj._meta:
            meta = obj._meta
            self.last_model = meta
            fields = self.selected_fields
            if fields:
                fields = list(fields)
            else:
                fields = \
                    [field.name for field in meta.fields] + \
                    [field.name for field in meta.many_to_many]
            # The pk is always written as the first (combined) column,
            # so drop it from the regular field list to avoid writing
            # it twice.
            if meta.pk.attname in fields:
                fields.remove(meta.pk.attname)
            header = ['%s:%s' % (meta, meta.pk.attname)]
            header.extend(fields)
            # Table separator is an empty row
            self.output.writerow([])
            self.output.writerow(header)
        self.row = [str(obj._get_pk_val())]

    def end_object(self, obj):
        self.output.writerow(self.row)

    def handle_field(self, obj, field):
        self.row.append(self.get_string_value(obj, field))

    def handle_fk_field(self, obj, field):
        """Append the related object's key: its pk, or the field the
        relation targets when it is not the pk.  None becomes ''."""
        related = getattr(obj, field.name)
        # NOTE: renamed from 'repr' -- the original shadowed the builtin.
        if related is None:
            value = ''
        elif field.rel.field_name == related._meta.pk.name:
            # relation via pk
            value = str(related._get_pk_val())
        else:
            # relation via other field
            value = str(getattr(related, field.rel.field_name))
        self.row.append(value)

    def handle_m2m_field(self, obj, field):
        """Represented as a list of related pks, or an empty string if
        there are no related objects."""
        related = [rel._get_pk_val() for rel in getattr(obj, field.name).iterator()]
        if related:
            self.row.append(str(related))
        else:
            self.row.append('')

    def get_string_value(self, obj, field):
        """
        None always becomes ''.  For string values prepend a leading
        space if the string contains only spaces so '' becomes ' ' and
        ' ' becomes '  ', etc.  Other values are handled normally.
        """
        value = getattr(obj, field.name)
        if value is None:
            return ''
        elif is_string_field(field):
            if spaces_re.match(value):
                # Disambiguate space-only strings from None ('') by
                # adding one extra leading space.
                return ' ' + value
            else:
                return value
        else:
            return super(Serializer, self).get_string_value(obj, field)
175+
176+
class Deserializer(base.Deserializer):
    """
    Deserialize csv produced by the Serializer above.  An empty row
    marks the start of a new table; the first row after it is the
    header '<app.model>:<pk name>,field1,...'.
    """

    def __init__(self, stream_or_string, **options):
        super(Deserializer, self).__init__(stream_or_string, **options)
        self.next = self.__iter__().next

    def __iter__(self):
        header_coming = True
        for values in UnicodeReader(self.stream):
            if not values:
                # Empty row: the next non-empty row is a table header.
                header_coming = True
            elif header_coming:
                # Header row.  Split on the first colon only
                # (maxsplit=1) so the result always unpacks into
                # exactly two parts; the original maxsplit=2 raised
                # ValueError on any extra colon.
                model, first_field = values[0].split(':', 1)
                try:
                    self.model = models.get_model(*model.split("."))
                except TypeError:
                    raise base.DeserializationError("No model %s in db" % model)
                # Field names
                self.field_names = [first_field] + values[1:]
                header_coming = False
            else:
                # An object row: first column is the pk, the rest map
                # positionally onto self.field_names.
                meta = self.model._meta
                data = {meta.pk.attname: meta.pk.to_python(values[0])}
                m2m_data = {}
                for i in range(1, len(values)):
                    name = self.field_names[i]
                    value = values[i]
                    field = meta.get_field(name)
                    if field.rel and isinstance(field.rel, models.ManyToManyRel):
                        m2m_data[field.name] = self.handle_m2m_field(value, field)
                    elif field.rel and isinstance(field.rel, models.ManyToOneRel):
                        data[field.attname] = self.handle_fk_field(value, field)
                    else:
                        data[field.name] = self.handle_field(value, field)
                yield base.DeserializedObject(self.model(**data), m2m_data)

    def handle_field(self, raw, field):
        """'' means None; space-only strings lose the extra leading
        space the serializer added to distinguish them from None."""
        if raw == '':
            raw = None
        elif is_string_field(field) and spaces_re.match(raw):
            raw = raw[1:]
        return field.to_python(raw)

    def handle_fk_field(self, raw, field):
        """Convert the raw key with the targeted related field's
        to_python; '' means None."""
        if raw == '':
            return None
        related_field = field.rel.to._meta.get_field(field.rel.field_name)
        return related_field.to_python(raw)

    def handle_m2m_field(self, raw, field):
        """Parse a list of pks such as '[1, 2]'.  Uses literal_eval
        rather than eval: the stream may come from an untrusted source
        and eval would execute arbitrary code embedded in the csv."""
        if not raw:
            return []
        from ast import literal_eval
        return literal_eval(raw)
236+
237+
def is_string_field(field):
    """
    Return True if the field stores string data.

    The check is by class *name* so fields that are only available in
    some Django versions (FileField, USStateField) can be listed
    without importing them.  (The original compared the class object
    itself to the name string, which is always False, so string fields
    were never recognized and the None-vs-'' disambiguation never ran.)

    If all field classes working with strings extended CharField, we
    wouldn't need this function.
    """
    string_types = ('CharField', 'FileField', 'FilePathField', 'SlugField',
        'TextField', 'USStateField')
    return field.__class__.__name__ in string_types
255+
256+
257+
258+# Copied from csv module examples with some modifications
259+# - getincrementalencoder replaced with getencoder because it works with
260+# python < 2.5
261+
class UTF8Recoder:
    """
    Iterator that reads an encoded stream and reencodes the input to
    UTF-8, so the csv module only ever sees utf-8 byte strings.
    """
    def __init__(self, f, encoding):
        self.reader = codecs.getreader(encoding)(f)

    def __iter__(self):
        return self

    def __next__(self):
        # builtin next() dispatches to the reader's iterator protocol
        # on both Python 2 (>= 2.6) and Python 3.
        return next(self.reader).encode("utf-8")

    # Python 2 iterator protocol: keep .next() as an alias so existing
    # callers (e.g. explicit .next() calls) still work.
    next = __next__
274+
class UnicodeReader:
    """
    A CSV reader which will iterate over lines in the CSV file "f",
    which is encoded in the given encoding.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Recode the input to utf-8 first so the csv module parses a
        # uniform byte encoding regardless of the source encoding.
        recoded = UTF8Recoder(f, encoding)
        self.reader = csv.reader(recoded, dialect=dialect, **kwds)

    def __iter__(self):
        return self

    def next(self):
        # Decode each cell back to unicode after csv has split the row.
        return [unicode(cell, "utf-8") for cell in self.reader.next()]
291+
class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue; csv always writes utf-8 there and
        # each row is recoded to the target encoding on the way out.
        self.queue = StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encode = codecs.getencoder(encoding)

    def writerow(self, row):
        """Write one row: csv-format it into the queue, then recode the
        queued utf-8 data into the target encoding and flush it to the
        target stream."""
        self.writer.writerow([s for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encode(data)[0]
        # write to the target stream
        self.stream.write(data)
        # Empty the queue.  truncate() by itself does not rewind the
        # file position on io streams, so without the seek() the next
        # row would be written after a gap of the old data's length.
        self.queue.seek(0)
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)