Ticket #5253: csv_serializer.diff

File csv_serializer.diff, 11.5 KB (added by Adam Schmideg <adam@…>, 8 years ago)

An implementation that works with serializers_regress test ready for inclusion

  • django/core/serializers/__init__.py

     
    2323    "xml"    : "django.core.serializers.xml_serializer",
    2424    "python" : "django.core.serializers.python",
    2525    "json"   : "django.core.serializers.json",
     26    "csv"    : "django.core.serializers.csv_serializer",
    2627}
    2728
    2829# Check for PyYaml and register the serializer if it's available.
  • django/core/serializers/csv_serializer.py

     
     1"""
     2Serialize multiple table data to and from a single csv stream, using the
     3standard csv module.
     4
     5The format of csv is sort of standardized in rfc4180, which notes
     6that multiple, even incompatible, implementations exist.  It treats headers as
     7optional where column names are separated the same way as field values.
     8It leaves some important questions open,
     9 - how to handle null values as opposed to empty strings,
     10 - how to handle relations, such as foreign keys or many-to-many
     11   relations,
     12 - how to represent multiple tables in a single csv file.
     13
     14The latter issue is addressed in Creativyst's ctx format at
     15http://www.creativyst.com/Doc/Std/ctx/ctx.htm where a multiple-line
     16header is used to describe metadata.  I didn't want to use their
     17approach because it conflicts with existing csv tools (such as the
     18python csv module) for simpler cases.
     19
     20Let's start with an example what csv this module produces and
     21understands.
     22
     23news_author:registration_number,name
     24555001,Jack
     25555002,Jill
     26
     27news_article:id,authors,title,text,published
     281,"[1, 2]",On Life And Cheese,Once upon a time...,2001-05-30
     292,[2],,I should write this,
     30
     31Here is a summary of how values are represented.
     32 - Tables are separated by two lineterminators because it's not
     33   intrusive and gives a good visual guidance.  It's simply parsed as an
     34   empty line by csv tools, preserving the structure.  A single csv file
     35   is also easy to split by the separator using csplit for example.
     36 - Headers are mandatory, containing the column names separated by
     37   commas.
     38 - The first header field is special, it has the form '<table name>:<pk
     39   name>'.  This doesn't conflict with other parsers; and the colon as
     40   separator is widely used in the Unix world and it cannot be part of
     41   the table or column name.  The usage of <pk name> instead of just
     42   'pk' is intentional, although it differs from the constant usage of
     43   'pk' in the json and xml serializer modules -- this is how database
     44   dumps work, for example in sqlite.
     45 - None is represented as an empty string.
     46 - Foreign keys are represented as integers.
     47 - Many-to-many relations are represented as a list of foreign keys.
     48 - Strings are represented as they are except for strings that contain
     49   only zero or more spaces.
     50 - Strings of only zero or more spaces are prepended an extra leading
     51   space, so '' becomes ' ', ' ' becomes '  ', etc.  This may look
     52   strange first but this is how None (represented as '') and ''
     53   (represented as ' ') are distinguished.  Space-only strings are a
     54   rare beast, leading and trailing spaces are also frequently trimmed
     55   by csv parsers, so I find this a fair compromise.
     56"""
     57import codecs
     58import csv
     59try:
     60    from cStringIO import StringIO
     61except ImportError:
     62    from StringIO import StringIO
     63import os
     64import re
     65
     66from django.core.serializers import base
     67from django.db import models
     68# These fields should all extend CharField since they all work with
     69# string data
     70from django.db.models.fields import CharField, FileField, \
     71  FilePathField, SlugField, TextField, USStateField
     72
     73
     74spaces_re = re.compile('^[ ]*$')
     75
     76class Serializer(base.Serializer):
     77    "Serialize to csv"
     78
     79    def start_serialization(self):
     80        self.last_model = None
     81        # By default, csv module uses '\r\n' as lineterminator
     82        self.output = UnicodeWriter(self.stream, lineterminator=os.linesep)
     83
     84    def start_object(self, obj):
     85        if not hasattr(obj, "_meta"):
     86            raise base.SerializationError("Non-model object (%s) encountered during serialization" % type(obj))
     87        if self.last_model != obj._meta:
     88            meta = obj._meta
     89            self.last_model = meta
     90            fields = self.selected_fields
     91            if fields:
     92                fields = list(fields)
     93            else:
     94                fields = \
     95                    [field.name for field in meta.fields] + \
     96                    [field.name for field in meta.many_to_many]
     97            if meta.pk.attname in fields:
     98                fields.remove(meta.pk.attname)
     99            header = ['%s:%s' % (meta, meta.pk.attname)]
     100            for field_name in fields:
     101                header.append(field_name)
     102            # Table separator is an empty row
     103            self.output.writerow([])
     104            self.output.writerow(header)
     105        self.row = [str(obj._get_pk_val())]
     106
     107    def end_object(self, obj):
     108        self.output.writerow(self.row)
     109
     110    def handle_field(self, obj, field):
     111        self.row.append(self.get_string_value(obj, field))
     112
     113    def handle_fk_field(self, obj, field):
     114        related = getattr(obj, field.name)
     115        if related is None:
     116            repr = ''
     117        else:
     118            if field.rel.field_name == related._meta.pk.name:
     119                # relation via pk
     120                repr = str(related._get_pk_val())
     121            else:
     122                # relation via other field
     123                repr = str(getattr(related, field.rel.field_name))
     124        self.row.append(repr)
     125
     126    def handle_m2m_field(self, obj, field):
     127        """Represented as a tuple of related ids, or empty string of there
     128        are no related objects"""
     129        related = [related._get_pk_val() for related in getattr(obj, field.name).iterator()]
     130        if related:
     131            self.row.append(str(related))
     132        else:
     133            self.row.append('')
     134
     135    def get_string_value(self, obj, field):
     136        """
     137        None always becomes ''.  For string values prepend a leading
     138        space if the string contains only spaces so '' becomes ' ' and '
     139        ' becomes '  ', etc.  Other values are handled normally.
     140        """
     141        value = getattr(obj, field.name)
     142        if value is None:
     143            return ''
     144        elif is_string_field(field):
     145            if spaces_re.match(value):
     146                return ' ' + value
     147            else:
     148                return value
     149        else:
     150            return super(Serializer, self).get_string_value(obj, field)
     151
     152
     153class Deserializer(base.Deserializer):
     154    "Deserialize from csv"
     155
     156    def __init__(self, stream_or_string, **options):
     157        super(Deserializer, self).__init__(stream_or_string, **options)
     158        self.next = self.__iter__().next
     159
     160    def __iter__(self):
     161        header_coming = True
     162        for values in UnicodeReader(self.stream):
     163            if not values:
     164                header_coming = True
     165            else:
     166                if header_coming:
     167                    # Model
     168                    model, first_field = values[0].split(':', 2)
     169                    try:
     170                        self.model = models.get_model(*model.split("."))
     171                    except TypeError:
     172                        raise base.DeserializationError("No model %s in db" % model)
     173                    # Field names
     174                    self.field_names = [first_field] + values[1:]
     175                    header_coming = False
     176                else:
     177                    # An object
     178                    meta = self.model._meta
     179                    data = {meta.pk.attname: meta.pk.to_python(values[0])}
     180                    m2m_data = {}
     181                    for i in range(1, len(values)):
     182                        name = self.field_names[i]
     183                        value = values[i]
     184                        field = meta.get_field(name)
     185                        if field.rel and isinstance(field.rel, models.ManyToManyRel):
     186                            m2m_data[field.name] = self.handle_m2m_field(value, field)
     187                        elif field.rel and isinstance(field.rel, models.ManyToOneRel):
     188                            data[field.attname] = self.handle_fk_field(value, field)
     189                        else:
     190                            data[field.name] = self.handle_field(value, field)
     191                    yield base.DeserializedObject(self.model(**data), m2m_data)
     192
     193    def handle_field(self, raw, field):
     194        if raw == '':
     195            raw = None
     196        elif is_string_field(field):
     197            if spaces_re.match(raw):
     198                raw = raw[1:]
     199        return field.to_python(raw)
     200
     201    def handle_fk_field(self, raw, field):
     202        if raw == '':
     203            return None
     204        related_field = field.rel.to._meta.get_field(field.rel.field_name)
     205        return related_field.to_python(raw)
     206
     207    def handle_m2m_field(self, raw, field):
     208        if raw:
     209            return eval(raw)
     210        else:
     211            return []
     212
     213
     214def is_string_field(field):
     215    """If all field classes working with strings extended CharField, we
     216    wouldn't need this method"""
     217    return bool(isinstance(field,
     218        (CharField, FileField, FilePathField, SlugField, TextField,
     219        USStateField)))
     220
     221
     222# Copied from csv module examples with some modifications
     223# - getincrementalencoder replaced with getencoder because it works with
     224# python < 2.5
     225
     226class UTF8Recoder:
     227    """
     228    Iterator that reads an encoded stream and reencodes the input to UTF-8
     229    """
     230    def __init__(self, f, encoding):
     231        self.reader = codecs.getreader(encoding)(f)
     232
     233    def __iter__(self):
     234        return self
     235
     236    def next(self):
     237        return self.reader.next().encode("utf-8")
     238
     239class UnicodeReader:
     240    """
     241    A CSV reader which will iterate over lines in the CSV file "f",
     242    which is encoded in the given encoding.
     243    """
     244
     245    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
     246        f = UTF8Recoder(f, encoding)
     247        self.reader = csv.reader(f, dialect=dialect, **kwds)
     248
     249    def next(self):
     250        row = self.reader.next()
     251        return [unicode(s, "utf-8") for s in row]
     252
     253    def __iter__(self):
     254        return self
     255
     256class UnicodeWriter:
     257    """
     258    A CSV writer which will write rows to CSV file "f",
     259    which is encoded in the given encoding.
     260    """
     261
     262    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
     263        # Redirect output to a queue
     264        self.queue = StringIO()
     265        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
     266        self.stream = f
     267        self.encode = codecs.getencoder(encoding)
     268
     269    def writerow(self, row):
     270        self.writer.writerow([s.encode("utf-8") for s in row])
     271        # Fetch UTF-8 output from the queue ...
     272        data = self.queue.getvalue()
     273        data = data.decode("utf-8")
     274        # ... and reencode it into the target encoding
     275        data = self.encode(data)[0]
     276        # write to the target stream
     277        self.stream.write(data)
     278        # empty queue
     279        self.queue.truncate(0)
     280
     281    def writerows(self, rows):
     282        for row in rows:
     283            self.writerow(row)
Back to Top