Code

Ticket #5253: csv_serializer_5253.diff

File csv_serializer_5253.diff, 11.3 KB (added by Mnewman, 6 years ago)

version of old (incomplete) patch that works with r9232

Line 
1Index: django/core/serializers/__init__.py
2===================================================================
3--- django/core/serializers/__init__.py (revision 9232)
4+++ django/core/serializers/__init__.py (working copy)
5@@ -23,6 +23,7 @@
6     "xml"    : "django.core.serializers.xml_serializer",
7     "python" : "django.core.serializers.python",
8     "json"   : "django.core.serializers.json",
9+    "csv"    : "django.core.serializers.csv_serializer",
10 }
11 
12 # Check for PyYaml and register the serializer if it's available.
13Index: django/core/serializers/csv_serializer.py
14===================================================================
15--- django/core/serializers/csv_serializer.py   (revision 0)
16+++ django/core/serializers/csv_serializer.py   (revision 0)
17@@ -0,0 +1,284 @@
18+"""
19+Serialize multiple table data to and from a single csv stream, using the
20+standard csv module.
21+
22+The format of csv is sort of standardized in rfc4180, stating that there
23+are more implementations, even incompatible ones.  It treats headers as
24+optional where column names are separated the same way as field values.
25+It leaves some important questions open,
26+ - how to handle null values as opposed to empty strings,
27+ - how to handle relations, such as foreign keys or many-to-many
28+   relations,
29+ - how to represent multiple tables in a single csv file.
30+
31+The latter issue is addressed in Creativyst's ctx format at
32+http://www.creativyst.com/Doc/Std/ctx/ctx.htm where a multiple-line
33+header is used to describe metadata.  I didn't want to use their
34+approach because it conflicts with existing csv tools (such as the
35+python csv module) for simpler cases.
36+
37+Let's start with an example of the csv this module produces and
38+understands.
39+
40+news_author:registration_number,name
41+555001,Jack
42+555002,Jill
43+
44+news_article:id,authors,title,text,published
45+1,"[1, 2]",On Life And Chees,Once upon a time...,2001-05-30
46+2,[2],,I should write this,
47+
48+Here is a summary of how values are represented.
49+ - Tables are separated by two lineterminators because it's not
50+   intrusive and gives a good visual guidance.  It's simply parsed as an
51+   empty line by csv tools, preserving the structure.  A single csv file
52+   is also easy to split by the separator using csplit for example.
53+ - Headers are mandatory, containing the column names separated by
54+   commas.
55+ - The first header field is special, it has the form '<table name>:<pk
56+   name>'.  This doesn't conflict with other parsers; and the colon as
57+   separator is widely used in the Unix world and it cannot be part of
58+   the table or column name.  The usage of <pk name> instead of just
59+   'pk' is intentional, although it differs from the constant usage of
60+   'pk' in the json and xml serializer modules -- this is how database
61+   dumps work, for example in sqlite.
62+ - None is represented as an empty string.
63+ - Foreign keys are represented as integers.
64+ - Many-to-many relations are represented as a list of foreign keys.
65+ - Strings are represented as they are except for strings that contain
66+   only zero or more spaces.
67+ - Strings of only zero or more spaces get an extra leading space
68+   prepended, so '' becomes ' ', ' ' becomes '  ', etc.  This may look
69+   strange at first but this is how None (represented as '') and ''
70+   (represented as ' ') are distinguished.  Space-only strings are a
71+   rare beast, leading and trailing spaces are also frequently trimmed
72+   by csv parsers, so I find this a fair compromise.
73+"""
74+import codecs
75+import csv
76+try:
77+    from cStringIO import StringIO
78+except ImportError:
79+    from StringIO import StringIO
80+import os
81+import re
82+
83+from django.core.serializers import base
84+from django.db import models
85+# These fields should all extend CharField since they all work with
86+# string data
87+from django.db.models.fields import CharField, FilePathField, SlugField, TextField
88+from django.db.models.fields.files import FileField
89+from django.contrib.localflavor.us.models import USStateField
90+
91+
92+spaces_re = re.compile('^[ ]*$')
93+
94+class Serializer(base.Serializer):
95+    "Serialize to csv"
96+
97+    def start_serialization(self):
98+        self.last_model = None
99+        # By default, csv module uses '\r\n' as lineterminator
100+        self.output = UnicodeWriter(self.stream, lineterminator=os.linesep)
101+
102+    def start_object(self, obj):
103+        if not hasattr(obj, "_meta"):
104+            raise base.SerializationError("Non-model object (%s) encountered during serialization" % type(obj))
105+        if self.last_model != obj._meta:
106+            meta = obj._meta
107+            self.last_model = meta
108+            fields = self.selected_fields
109+            if fields:
110+                fields = list(fields)
111+            else:
112+                fields = \
113+                    [field.name for field in meta.fields] + \
114+                    [field.name for field in meta.many_to_many]
115+            if meta.pk.attname in fields:
116+                fields.remove(meta.pk.attname)
117+            header = ['%s:%s' % (meta, meta.pk.attname)]
118+            for field_name in fields:
119+                header.append(field_name)
120+            # Table separator is an empty row
121+            self.output.writerow([])
122+            self.output.writerow(header)
123+        self.row = [str(obj._get_pk_val())]
124+
125+    def end_object(self, obj):
126+        self.output.writerow(self.row)
127+
128+    def handle_field(self, obj, field):
129+        self.row.append(self.get_string_value(obj, field))
130+
131+    def handle_fk_field(self, obj, field):
132+        related = getattr(obj, field.name)
133+        if related is None:
134+            repr = ''
135+        else:
136+            if field.rel.field_name == related._meta.pk.name:
137+                # relation via pk
138+                repr = str(related._get_pk_val())
139+            else:
140+                # relation via other field
141+                repr = str(getattr(related, field.rel.field_name))
142+        self.row.append(repr)
143+
144+    def handle_m2m_field(self, obj, field):
145+        """Represented as a list of related ids, or empty string if there
146+        are no related objects"""
147+        related = [related._get_pk_val() for related in getattr(obj, field.name).iterator()]
148+        if related:
149+            self.row.append(str(related))
150+        else:
151+            self.row.append('')
152+
153+    def get_string_value(self, obj, field):
154+        """
155+        None always becomes ''.  For string values prepend a leading
156+        space if the string contains only spaces, so '' becomes ' ',
157+        ' ' becomes '  ', etc.  Other values are handled normally.
158+        """
159+        value = getattr(obj, field.name)
160+        if value is None:
161+            return ''
162+        elif is_string_field(field):
163+            if spaces_re.match(value):
164+                return ' ' + value
165+            else:
166+                return value
167+        else:
168+            return super(Serializer, self).get_string_value(obj, field)
169+
170+
171+class Deserializer(base.Deserializer):
172+    "Deserialize from csv"
173+
174+    def __init__(self, stream_or_string, **options):
175+        super(Deserializer, self).__init__(stream_or_string, **options)
176+        self.next = self.__iter__().next
177+
178+    def __iter__(self):
179+        header_coming = True
180+        for values in UnicodeReader(self.stream):
181+            if not values:
182+                header_coming = True
183+            else:
184+                if header_coming:
185+                    # Model
186+                    model, first_field = values[0].split(':', 2)
187+                    try:
188+                        self.model = models.get_model(*model.split("."))
189+                    except TypeError:
190+                        raise base.DeserializationError("No model %s in db" % model)
191+                    # Field names
192+                    self.field_names = [first_field] + values[1:]
193+                    header_coming = False
194+                else:
195+                    # An object
196+                    meta = self.model._meta
197+                    data = {meta.pk.attname: meta.pk.to_python(values[0])}
198+                    m2m_data = {}
199+                    for i in range(1, len(values)):
200+                        name = self.field_names[i]
201+                        value = values[i]
202+                        field = meta.get_field(name)
203+                        if field.rel and isinstance(field.rel, models.ManyToManyRel):
204+                            m2m_data[field.name] = self.handle_m2m_field(value, field)
205+                        elif field.rel and isinstance(field.rel, models.ManyToOneRel):
206+                            data[field.attname] = self.handle_fk_field(value, field)
207+                        else:
208+                            data[field.name] = self.handle_field(value, field)
209+                    yield base.DeserializedObject(self.model(**data), m2m_data)
210+
211+    def handle_field(self, raw, field):
212+        if raw == '':
213+            raw = None
214+        elif is_string_field(field):
215+            if spaces_re.match(raw):
216+                raw = raw[1:]
217+        return field.to_python(raw)
218+
219+    def handle_fk_field(self, raw, field):
220+        if raw == '':
221+            return None
222+        related_field = field.rel.to._meta.get_field(field.rel.field_name)
223+        return related_field.to_python(raw)
224+
225+    def handle_m2m_field(self, raw, field):
226+        if raw:
227+            return eval(raw)
228+        else:
229+            return []
230+
231+
232+def is_string_field(field):
233+    """If all field classes working with strings extended CharField, we
234+    wouldn't need this method"""
235+    return bool(isinstance(field,
236+        (CharField, FileField, FilePathField, SlugField, TextField,
237+        USStateField)))
238+
239+
240+# Copied from csv module examples with some modifications
241+# - getincrementalencoder replaced with getencoder because it works with
242+# python < 2.5
243+
244+class UTF8Recoder:
245+    """
246+    Iterator that reads an encoded stream and reencodes the input to UTF-8
247+    """
248+    def __init__(self, f, encoding):
249+        self.reader = codecs.getreader(encoding)(f)
250+
251+    def __iter__(self):
252+        return self
253+
254+    def next(self):
255+        return self.reader.next().encode("utf-8")
256+
257+class UnicodeReader:
258+    """
259+    A CSV reader which will iterate over lines in the CSV file "f",
260+    which is encoded in the given encoding.
261+    """
262+
263+    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
264+        f = UTF8Recoder(f, encoding)
265+        self.reader = csv.reader(f, dialect=dialect, **kwds)
266+
267+    def next(self):
268+        row = self.reader.next()
269+        return [unicode(s, "utf-8") for s in row]
270+
271+    def __iter__(self):
272+        return self
273+
274+class UnicodeWriter:
275+    """
276+    A CSV writer which will write rows to CSV file "f",
277+    which is encoded in the given encoding.
278+    """
279+
280+    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
281+        # Redirect output to a queue
282+        self.queue = StringIO()
283+        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
284+        self.stream = f
285+        self.encode = codecs.getencoder(encoding)
286+
287+    def writerow(self, row):
288+        self.writer.writerow([s.encode("utf-8") for s in row])
289+        # Fetch UTF-8 output from the queue ...
290+        data = self.queue.getvalue()
291+        data = data.decode("utf-8")
292+        # ... and reencode it into the target encoding
293+        data = self.encode(data)[0]
294+        # write to the target stream
295+        self.stream.write(data)
296+        # empty queue
297+        self.queue.truncate(0)
298+
299+    def writerows(self, rows):
300+        for row in rows:
301+            self.writerow(row)