Code

Ticket #5253: csv_serializer.diff

File csv_serializer.diff, 11.5 KB (added by Adam Schmideg <adam@…>, 7 years ago)

An implementation that works with the serializers_regress test, ready for inclusion

Line 
1Index: django/core/serializers/__init__.py
2===================================================================
3--- django/core/serializers/__init__.py (revision 6031)
4+++ django/core/serializers/__init__.py (working copy)
5@@ -23,6 +23,7 @@
6     "xml"    : "django.core.serializers.xml_serializer",
7     "python" : "django.core.serializers.python",
8     "json"   : "django.core.serializers.json",
9+    "csv"    : "django.core.serializers.csv_serializer",
10 }
11 
12 # Check for PyYaml and register the serializer if it's available.
13Index: django/core/serializers/csv_serializer.py
14===================================================================
15--- django/core/serializers/csv_serializer.py   (revision 0)
16+++ django/core/serializers/csv_serializer.py   (revision 0)
17@@ -0,0 +1,283 @@
18+"""
19+Serialize multiple table data to and from a single csv stream, using the
20+standard csv module.
21+
22+The csv format is loosely standardized in RFC 4180, which notes that
23+several implementations exist, some of them mutually incompatible.  It
24+treats headers as optional, with column names separated the same way as
25+field values.  It leaves some important questions open:
26+ - how to handle null values as opposed to empty strings,
27+ - how to handle relations, such as foreign keys or many-to-many
28+   relations,
29+ - how to represent multiple tables in a single csv file.
30+
31+The last issue is addressed in Creativyst's ctx format at
32+http://www.creativyst.com/Doc/Std/ctx/ctx.htm where a multiple-line
33+header is used to describe metadata.  I didn't want to use their
34+approach because it conflicts with existing csv tools (such as the
35+python csv module) for simpler cases.
36+
37+Let's start with an example of the csv this module produces and
38+understands.
39+
40+news_author:registration_number,name
41+555001,Jack
42+555002,Jill
43+
44+news_article:id,authors,title,text,published
45+1,"[1, 2]",On Life And Chees,Once upon a time...,2001-05-30
46+2,[2],,I should write this,
47+
48+Here is a summary of how values are represented.
49+ - Tables are separated by two lineterminators because it's not
50+   intrusive and gives a good visual guidance.  It's simply parsed as an
51+   empty line by csv tools, preserving the structure.  A single csv file
52+   is also easy to split by the separator using csplit for example.
53+ - Headers are mandatory, containing the column names separated by
54+   commas.
55+ - The first header field is special: it has the form '<table name>:<pk
56+   name>'.  This doesn't conflict with other parsers; the colon as a
57+   separator is widely used in the Unix world and it cannot be part of
58+   the table or column name.  Using <pk name> instead of just 'pk' is
59+   intentional, although it differs from the consistent use of 'pk'
60+   in the json and xml serializer modules -- this is how database
61+   dumps work, for example in sqlite.
62+ - None is represented as an empty string.
63+ - Foreign keys are represented as integers.
64+ - Many-to-many relations are represented as a list of foreign keys.
65+ - Strings are represented as they are except for strings that contain
66+   only zero or more spaces.
67+ - Strings of only zero or more spaces get an extra leading space
68+   prepended, so '' becomes ' ', ' ' becomes '  ', etc.  This may look
69+   strange at first, but this is how None (represented as '') and ''
70+   (represented as ' ') are distinguished.  Space-only strings are a
71+   rare beast, and leading and trailing spaces are frequently trimmed
72+   by csv parsers anyway, so I find this a fair compromise.
73+"""
74+import codecs
75+import csv
76+try:
77+    from cStringIO import StringIO
78+except ImportError:
79+    from StringIO import StringIO
80+import os
81+import re
82+
83+from django.core.serializers import base
84+from django.db import models
85+# These fields should all extend CharField since they all work with
86+# string data
87+from django.db.models.fields import CharField, FileField, \
88+  FilePathField, SlugField, TextField, USStateField
89+
90+
91+spaces_re = re.compile('^[ ]*$')
92+
93+class Serializer(base.Serializer):
94+    "Serialize to csv"
95+
96+    def start_serialization(self):
97+        self.last_model = None
98+        # By default, csv module uses '\r\n' as lineterminator
99+        self.output = UnicodeWriter(self.stream, lineterminator=os.linesep)
100+
101+    def start_object(self, obj):
102+        if not hasattr(obj, "_meta"):
103+            raise base.SerializationError("Non-model object (%s) encountered during serialization" % type(obj))
104+        if self.last_model != obj._meta:
105+            meta = obj._meta
106+            self.last_model = meta
107+            fields = self.selected_fields
108+            if fields:
109+                fields = list(fields)
110+            else:
111+                fields = \
112+                    [field.name for field in meta.fields] + \
113+                    [field.name for field in meta.many_to_many]
114+            if meta.pk.attname in fields:
115+                fields.remove(meta.pk.attname)
116+            header = ['%s:%s' % (meta, meta.pk.attname)]
117+            for field_name in fields:
118+                header.append(field_name)
119+            # Table separator is an empty row
120+            self.output.writerow([])
121+            self.output.writerow(header)
122+        self.row = [str(obj._get_pk_val())]
123+
124+    def end_object(self, obj):
125+        self.output.writerow(self.row)
126+
127+    def handle_field(self, obj, field):
128+        self.row.append(self.get_string_value(obj, field))
129+
130+    def handle_fk_field(self, obj, field):
131+        related = getattr(obj, field.name)
132+        if related is None:
133+            repr = ''
134+        else:
135+            if field.rel.field_name == related._meta.pk.name:
136+                # relation via pk
137+                repr = str(related._get_pk_val())
138+            else:
139+                # relation via other field
140+                repr = str(getattr(related, field.rel.field_name))
141+        self.row.append(repr)
142+
143+    def handle_m2m_field(self, obj, field):
144+        """Represented as a list of related ids, or an empty string if there
145+        are no related objects"""
146+        related = [related._get_pk_val() for related in getattr(obj, field.name).iterator()]
147+        if related:
148+            self.row.append(str(related))
149+        else:
150+            self.row.append('')
151+
152+    def get_string_value(self, obj, field):
153+        """
154+        None always becomes ''.  For string values prepend a leading
155+        space if the string contains only spaces, so '' becomes ' ' and
156+        ' ' becomes '  ', etc.  Other values are handled normally.
157+        """
158+        value = getattr(obj, field.name)
159+        if value is None:
160+            return ''
161+        elif is_string_field(field):
162+            if spaces_re.match(value):
163+                return ' ' + value
164+            else:
165+                return value
166+        else:
167+            return super(Serializer, self).get_string_value(obj, field)
168+
169+
170+class Deserializer(base.Deserializer):
171+    "Deserialize from csv"
172+
173+    def __init__(self, stream_or_string, **options):
174+        super(Deserializer, self).__init__(stream_or_string, **options)
175+        self.next = self.__iter__().next
176+
177+    def __iter__(self):
178+        header_coming = True
179+        for values in UnicodeReader(self.stream):
180+            if not values:
181+                header_coming = True
182+            else:
183+                if header_coming:
184+                    # Model
185+                    model, first_field = values[0].split(':', 2)
186+                    try:
187+                        self.model = models.get_model(*model.split("."))
188+                    except TypeError:
189+                        raise base.DeserializationError("No model %s in db" % model)
190+                    # Field names
191+                    self.field_names = [first_field] + values[1:]
192+                    header_coming = False
193+                else:
194+                    # An object
195+                    meta = self.model._meta
196+                    data = {meta.pk.attname: meta.pk.to_python(values[0])}
197+                    m2m_data = {}
198+                    for i in range(1, len(values)):
199+                        name = self.field_names[i]
200+                        value = values[i]
201+                        field = meta.get_field(name)
202+                        if field.rel and isinstance(field.rel, models.ManyToManyRel):
203+                            m2m_data[field.name] = self.handle_m2m_field(value, field)
204+                        elif field.rel and isinstance(field.rel, models.ManyToOneRel):
205+                            data[field.attname] = self.handle_fk_field(value, field)
206+                        else:
207+                            data[field.name] = self.handle_field(value, field)
208+                    yield base.DeserializedObject(self.model(**data), m2m_data)
209+
210+    def handle_field(self, raw, field):
211+        if raw == '':
212+            raw = None
213+        elif is_string_field(field):
214+            if spaces_re.match(raw):
215+                raw = raw[1:]
216+        return field.to_python(raw)
217+
218+    def handle_fk_field(self, raw, field):
219+        if raw == '':
220+            return None
221+        related_field = field.rel.to._meta.get_field(field.rel.field_name)
222+        return related_field.to_python(raw)
223+
224+    def handle_m2m_field(self, raw, field):
225+        if raw:
226+            return eval(raw)
227+        else:
228+            return []
229+
230+
231+def is_string_field(field):
232+    """If all field classes working with strings extended CharField, we
233+    wouldn't need this function"""
234+    return bool(isinstance(field,
235+        (CharField, FileField, FilePathField, SlugField, TextField,
236+        USStateField)))
237+
238+
239+# Copied from csv module examples with some modifications
240+# - getincrementalencoder replaced with getencoder because it works with
241+# python < 2.5
242+
243+class UTF8Recoder:
244+    """
245+    Iterator that reads an encoded stream and reencodes the input to UTF-8
246+    """
247+    def __init__(self, f, encoding):
248+        self.reader = codecs.getreader(encoding)(f)
249+
250+    def __iter__(self):
251+        return self
252+
253+    def next(self):
254+        return self.reader.next().encode("utf-8")
255+
256+class UnicodeReader:
257+    """
258+    A CSV reader which will iterate over lines in the CSV file "f",
259+    which is encoded in the given encoding.
260+    """
261+
262+    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
263+        f = UTF8Recoder(f, encoding)
264+        self.reader = csv.reader(f, dialect=dialect, **kwds)
265+
266+    def next(self):
267+        row = self.reader.next()
268+        return [unicode(s, "utf-8") for s in row]
269+
270+    def __iter__(self):
271+        return self
272+
273+class UnicodeWriter:
274+    """
275+    A CSV writer which will write rows to CSV file "f",
276+    which is encoded in the given encoding.
277+    """
278+
279+    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
280+        # Redirect output to a queue
281+        self.queue = StringIO()
282+        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
283+        self.stream = f
284+        self.encode = codecs.getencoder(encoding)
285+
286+    def writerow(self, row):
287+        self.writer.writerow([s.encode("utf-8") for s in row])
288+        # Fetch UTF-8 output from the queue ...
289+        data = self.queue.getvalue()
290+        data = data.decode("utf-8")
291+        # ... and reencode it into the target encoding
292+        data = self.encode(data)[0]
293+        # write to the target stream
294+        self.stream.write(data)
295+        # empty queue
296+        self.queue.truncate(0)
297+
298+    def writerows(self, rows):
299+        for row in rows:
300+            self.writerow(row)