| | 1 | """ |
| | 2 | Serialize multiple table data to and from a single csv stream, using the |
| | 3 | standard csv module. |
| | 4 | |
| | 5 | The format of csv is sort of standardized in rfc4180, stating that there |
are many implementations, even incompatible ones. It treats headers as
| | 7 | optional where column names are separated the same way as field values. |
| | 8 | It leaves some important questions open, |
| | 9 | - how to handle null values as opposed to empty strings, |
| | 10 | - how to handle relations, such as foreign keys or many-to-many |
| | 11 | relations, |
| | 12 | - how to represent multiple tables in a single csv file. |
| | 13 | |
| | 14 | The latter issue is addressed in Creativyst's ctx format at |
| | 15 | http://www.creativyst.com/Doc/Std/ctx/ctx.htm where a multiple-line |
| | 16 | header is used to describe metadata. I didn't want to use their |
| | 17 | approach because it conflicts with existing csv tools (such as the |
| | 18 | python csv module) for simpler cases. |
| | 19 | |
| | 20 | Let's start with an example what csv this module produces and |
| | 21 | understands. |
| | 22 | |
| | 23 | news_author:registration_number,name |
| | 24 | 555001,Jack |
| | 25 | 555002,Jill |
| | 26 | |
| | 27 | news_article:id,authors,title,text,published |
1,"[1, 2]",On Life And Cheese,Once upon a time...,2001-05-30
| | 29 | 2,[2],,I should write this, |
| | 30 | |
| | 31 | Here is a summary of how values are represented. |
| | 32 | - Tables are separated by two lineterminators because it's not |
| | 33 | intrusive and gives a good visual guidance. It's simply parsed as an |
| | 34 | empty line by csv tools, preserving the structure. A single csv file |
| | 35 | is also easy to split by the separator using csplit for example. |
| | 36 | - Headers are mandatory, containing the column names separated by |
| | 37 | commas. |
| | 38 | - The first header field is special, it has the form '<table name>:<pk |
| | 39 | name>'. This doesn't conflict with other parsers; and the colon as |
| | 40 | separator is widely used in the Unix world and it cannot be part of |
| | 41 | the table or column name. The usage of <pk name> instead of just |
| | 42 | 'pk' is intentional, although it differs from the constant usage of |
'pk' in the json and xml serializer modules -- this is how database
| | 44 | dumps work, for example in sqlite. |
| | 45 | - None is represented as an empty string. |
| | 46 | - Foreign keys are represented as integers. |
| | 47 | - Many-to-many relations are represented as a list of foreign keys. |
| | 48 | - Strings are represented as they are except for strings that contain |
| | 49 | only zero or more spaces. |
| | 50 | - Strings of only zero or more spaces are prepended an extra leading |
space, so '' becomes ' ', ' ' becomes '  ', etc. This may look
strange at first, but this is how None (represented as '') and ''
| | 53 | (represented as ' ') are distinguished. Space-only strings are a |
| | 54 | rare beast, leading and trailing spaces are also frequently trimmed |
| | 55 | by csv parsers, so I find this a fair compromise. |
| | 56 | """ |
import codecs
import csv
import os
import re
try:
    # ast.literal_eval (Python >= 2.6) safely parses Python literals.
    from ast import literal_eval
except ImportError:
    # Older interpreters: fall back to eval (the original behaviour).
    literal_eval = eval
try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO
| | 65 | |
| | 66 | from django.core.serializers import base |
| | 67 | from django.db import models |
| | 68 | # These fields should all extend CharField since they all work with |
| | 69 | # string data |
| | 70 | from django.db.models.fields import CharField, FilePathField, SlugField, TextField |
| | 71 | |
| | 72 | # FileField and USStateField are only available in Django 1.0.X |
| | 73 | #from django.db.models.fields.files import FileField |
| | 74 | #from django.contrib.localflavor.us.models import USStateField |
| | 75 | |
# Matches strings consisting of zero or more spaces only (including '');
# used to implement the space-only-string escaping described in the
# module docstring.
spaces_re = re.compile('^[ ]*$')
| | 77 | |
class Serializer(base.Serializer):
    "Serialize to csv"

    def start_serialization(self):
        # Model of the last object written; a change triggers a new
        # table header (and a separator row between tables).
        self.last_model = None
        # By default, csv module uses '\r\n' as lineterminator
        self.output = UnicodeWriter(self.stream, lineterminator=os.linesep)

    def start_object(self, obj):
        """Begin a data row for obj; first emit a separator row and a
        header row whenever the model changes."""
        if not hasattr(obj, "_meta"):
            raise base.SerializationError("Non-model object (%s) encountered during serialization" % type(obj))
        if self.last_model != obj._meta:
            meta = obj._meta
            if self.last_model is not None:
                # Table separator is an empty row.  Only write it
                # *between* tables; the original wrote it before the
                # first table too, producing a leading blank line that
                # contradicts the documented format.
                self.output.writerow([])
            self.last_model = meta
            fields = self.selected_fields
            if fields:
                fields = list(fields)
            else:
                fields = \
                    [field.name for field in meta.fields] + \
                    [field.name for field in meta.many_to_many]
            # The pk is carried by the special first header field
            # ('<table name>:<pk name>'), not by the plain column list.
            if meta.pk.attname in fields:
                fields.remove(meta.pk.attname)
            header = ['%s:%s' % (meta, meta.pk.attname)]
            header.extend(fields)
            self.output.writerow(header)
        self.row = [str(obj._get_pk_val())]

    def end_object(self, obj):
        # The field handlers have filled self.row; flush it as one record.
        self.output.writerow(self.row)

    def handle_field(self, obj, field):
        self.row.append(self.get_string_value(obj, field))

    def handle_fk_field(self, obj, field):
        """Represent a foreign key as the related object's key value,
        or '' when the relation is null."""
        related = getattr(obj, field.name)
        if related is None:
            value = ''
        elif field.rel.field_name == related._meta.pk.name:
            # relation via pk
            value = str(related._get_pk_val())
        else:
            # relation via other field
            value = str(getattr(related, field.rel.field_name))
        self.row.append(value)

    def handle_m2m_field(self, obj, field):
        """Represented as a list of related pks, or empty string if there
        are no related objects"""
        related = [rel_obj._get_pk_val()
                   for rel_obj in getattr(obj, field.name).iterator()]
        if related:
            self.row.append(str(related))
        else:
            self.row.append('')

    def get_string_value(self, obj, field):
        """
        None always becomes ''. For string values prepend a leading
        space if the string contains only spaces so '' becomes ' ' and
        ' ' becomes '  ', etc. Other values are handled normally.
        """
        value = getattr(obj, field.name)
        if value is None:
            return ''
        if is_string_field(field):
            if spaces_re.match(value):
                # Disambiguate space-only strings from None (both would
                # otherwise serialize to '').
                return ' ' + value
            return value
        return super(Serializer, self).get_string_value(obj, field)
| | 153 | |
| | 154 | |
class Deserializer(base.Deserializer):
    "Deserialize from csv"

    def __init__(self, stream_or_string, **options):
        super(Deserializer, self).__init__(stream_or_string, **options)
        # base.Deserializer drives iteration through next(); bind it to
        # our generator's next method.
        self.next = self.__iter__().next

    def __iter__(self):
        """Yield DeserializedObject instances from the csv stream.

        An empty row is a table separator: the next non-empty row is a
        header of the form '<table name>:<pk name>,col,col,...'.
        """
        header_coming = True
        for values in UnicodeReader(self.stream):
            if not values:
                header_coming = True
            elif header_coming:
                # Header row.  maxsplit=1: only the first colon separates
                # the table name from the pk name (the original used 2,
                # which could yield three parts and break the unpacking).
                model, first_field = values[0].split(':', 1)
                try:
                    self.model = models.get_model(*model.split("."))
                except TypeError:
                    raise base.DeserializationError("No model %s in db" % model)
                if self.model is None:
                    # get_model returns None for unknown models; fail here
                    # with a clear error rather than later with an
                    # AttributeError on meta access.
                    raise base.DeserializationError("No model %s in db" % model)
                # Field names
                self.field_names = [first_field] + values[1:]
                header_coming = False
            else:
                # A data row for the current model
                meta = self.model._meta
                data = {meta.pk.attname: meta.pk.to_python(values[0])}
                m2m_data = {}
                for i in range(1, len(values)):
                    name = self.field_names[i]
                    value = values[i]
                    field = meta.get_field(name)
                    if field.rel and isinstance(field.rel, models.ManyToManyRel):
                        m2m_data[field.name] = self.handle_m2m_field(value, field)
                    elif field.rel and isinstance(field.rel, models.ManyToOneRel):
                        data[field.attname] = self.handle_fk_field(value, field)
                    else:
                        data[field.name] = self.handle_field(value, field)
                yield base.DeserializedObject(self.model(**data), m2m_data)

    def handle_field(self, raw, field):
        """Invert Serializer.get_string_value: '' -> None, and strip the
        extra leading space from space-only strings."""
        if raw == '':
            raw = None
        elif is_string_field(field):
            if spaces_re.match(raw):
                raw = raw[1:]
        return field.to_python(raw)

    def handle_fk_field(self, raw, field):
        """'' means a null relation; otherwise convert with the related
        field's to_python."""
        if raw == '':
            return None
        related_field = field.rel.to._meta.get_field(field.rel.field_name)
        return related_field.to_python(raw)

    def handle_m2m_field(self, raw, field):
        """Parse a list of related pks, e.g. '[1, 2]'; '' -> [].

        Uses literal_eval rather than eval(): the csv stream is
        untrusted input and eval() would execute arbitrary expressions
        embedded in it.
        """
        if raw:
            return literal_eval(raw)
        return []
| | 214 | |
| | 215 | |
def is_string_field(field):
    """Return True if ``field`` is a field holding plain string data.

    Matches by class *name* rather than isinstance() so that fields
    which are optional across Django versions (FileField, USStateField
    -- see the commented-out imports above) are recognised without
    importing them.

    Bug fix: the original compared the class object itself against the
    name string (``field.__class__ == s``), which is never true, so the
    function always returned False and the space-only-string escaping
    described in the module docstring never took effect.
    """
    string_types = ('CharField', 'FileField', 'FilePathField', 'SlugField',
                    'TextField', 'USStateField')
    return field.__class__.__name__ in string_types
| | 229 | |
| | 230 | # Copied from csv module examples with some modifications |
| | 231 | # - getincrementalencoder replaced with getencoder because it works with |
| | 232 | # python < 2.5 |
| | 233 | |
| | 234 | class UTF8Recoder: |
| | 235 | """ |
| | 236 | Iterator that reads an encoded stream and reencodes the input to UTF-8 |
| | 237 | """ |
| | 238 | def __init__(self, f, encoding): |
| | 239 | self.reader = codecs.getreader(encoding)(f) |
| | 240 | |
| | 241 | def __iter__(self): |
| | 242 | return self |
| | 243 | |
| | 244 | def next(self): |
| | 245 | return self.reader.next().encode("utf-8") |
| | 246 | |
class UnicodeReader:
    """
    A CSV reader which will iterate over lines in the CSV file "f",
    which is encoded in the given encoding.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Recode the incoming stream to utf-8 so csv.reader always sees
        # one known byte encoding.
        recoded = UTF8Recoder(f, encoding)
        self.reader = csv.reader(recoded, dialect=dialect, **kwds)

    def __iter__(self):
        return self

    def next(self):
        # csv.reader yields utf-8 byte strings; decode each cell back
        # to unicode.
        return [unicode(cell, "utf-8") for cell in self.reader.next()]
| | 263 | |
class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Rows are first written to an in-memory queue as utf-8, then
        # recoded to the target encoding and flushed to the real stream.
        self.queue = StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encode = codecs.getencoder(encoding)

    def writerow(self, row):
        """Write one row, recoding it from utf-8 to the target encoding."""
        # Pass the row through unchanged (the original copied it into a
        # throwaway list and kept a dead commented-out variant).
        self.writer.writerow(row)
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encode(data)[0]
        # write to the target stream
        self.stream.write(data)
        # empty queue (cStringIO.truncate(0) also rewinds on Python 2)
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)