Source code for odin.codecs.csv_codec

"""
CSV Codec
~~~~~~~~~

Codec for iterating a CSV file and parsing into a Resource.

The CSV codec is a codec that yields multiple resources rather than a single document.
The CSV codec does not support nesting of resources.

Reading data from a CSV file::

    with open("my_file.csv") as f:
        for resource in csv_codec.reader(f, MyResource):
            ...

"""
import csv
from io import StringIO

from odin import bases
from odin.datastructures import CaseLessStringList
from odin.exceptions import CodecDecodeError, ValidationError
from odin.fields import NotProvided
from odin.resources import create_resource_from_iter
from odin.utils import getmeta, lazy_property

# MIME content type string for CSV data.
CONTENT_TYPE = "text/csv"


class Reader(bases.TypedResourceIterable):
    """
    Customisable reader object.

    Wraps a CSV reader and yields one resource of ``resource_type`` per row.
    Behaviour is tuned through the class attributes below, which may be
    overridden in a subclass or (for the legacy options) passed as keyword
    arguments to the constructor.
    """

    csv_reader = csv.reader
    """
    CSV Reader object to use (if you wish to use *unicodecsv* or similar)
    """

    includes_header = True
    """
    File is expected to include a header.
    """

    ignore_header_case = False
    """
    Use case-less comparison on header fields.
    """

    strict_fields = False
    """
    Strictly check header fields.
    """

    csv_dialect = "excel"
    """
    CSV Dialect to use; defaults to the CSV libraries default value of *excel*.
    """

    default_empty_value = ""
    """
    The default value to use if a field is empty. This can be used to default
    to *None*.
    """

    def __init__(
        self, f, resource_type, full_clean=True, error_callback=None, **reader_kwargs
    ):
        """
        Initialise a reader

        :param f: Input file (or file like) object to read
        :param resource_type: Resource type to use as field template.
        :param full_clean: Perform a full clean on objects
        :param error_callback: Optional callback for errors; called with the
            validation error and the row index. Returning ``False`` re-raises
            the error, any other return value skips the row.
        :param reader_kwargs: kwargs to pass to the csv_reader
        :raises CodecDecodeError: If ``strict_fields`` is set and the header
            contains fields that are not defined on the resource.

        .. note:: When a header is expected the first row is consumed here;
            an input with no rows lets the ``StopIteration`` raised by
            ``next`` propagate out of the constructor.

        """
        super().__init__(resource_type)
        self.full_clean = full_clean
        if error_callback:
            self.handle_validation_error = error_callback

        # Backwards compatibility: these options used to be supplied as
        # keyword arguments, so lift them out before the remaining kwargs are
        # handed to the CSV reader.
        for arg in (
            "csv_reader",
            "includes_header",
            "ignore_header_case",
            "strict_fields",
            "csv_dialect",
        ):
            if arg in reader_kwargs:
                setattr(self, arg, reader_kwargs.pop(arg))

        # Create reader instance
        self._reader = self._create_reader(f, reader_kwargs)

        # Configure header
        if self.includes_header:
            self.header = self._read_header()

            # Handle strict fields
            if self.strict_fields and self.extra_field_names:
                raise CodecDecodeError(
                    "Extra unknown fields: {}".format(",".join(self.extra_field_names))
                )

        # Built in counters (populated by iteration)
        self.row_count = None
        self.error_count = None

    def __iter__(self):
        """
        Iterate the remaining CSV rows, yielding a resource for each row that
        could be converted.

        ``error_count`` is reset at the start of iteration and incremented for
        every row that raises a ``ValidationError``; ``row_count`` is set once
        the underlying reader is exhausted.
        """
        # Reset error count
        self.error_count = 0

        # Local vars
        resource = self.resource_type
        full_clean = self.full_clean
        default_empty_value = self.default_empty_value
        handle_validation_error = getattr(self, "handle_validation_error", None)
        idx = -1  # Ensures row_count is 0 when there are no rows

        def create_resource(values, i):
            try:
                return create_resource_from_iter(
                    # Handle empty values
                    (default_empty_value if v == "" else v for v in values),
                    resource,
                    full_clean,
                )
            except ValidationError as ve:
                # Don't raise these through yield as will cause a StopIteration
                # even if validation error can be handled safely.
                self.error_count += 1
                if not handle_validation_error:
                    raise

                # If handle error explicitly returns False raise exception
                if handle_validation_error(ve, i) is False:
                    raise
            # Error was handled; signal the caller to skip this row
            return None

        if self.includes_header:
            mapping = self.field_mapping
            for idx, row in enumerate(self._reader):
                # A row may have fewer cells than the header. Indexing past
                # the end would raise IndexError and abort iteration, so
                # missing cells are treated as NotProvided, the same way
                # columns absent from the header already are.
                row_len = len(row)
                res = create_resource(
                    (
                        NotProvided if s is NotProvided or s >= row_len else row[s]
                        for s in mapping
                    ),
                    idx + 1,
                )  # Add one to index as row "0" will be the header
                # Compare against None explicitly so a resource that happens
                # to evaluate as falsy is still yielded.
                if res is not None:
                    yield res
        else:
            for idx, row in enumerate(self._reader):
                res = create_resource(row, idx)
                if res is not None:
                    yield res

        self.row_count = idx + 1  # Add one to get a count from the last index

    def _create_reader(self, f, kwargs):
        """
        Create internal reader instance

        :param f: File (or file like) object
        :param kwargs: Dictionary of additional keyword args
        :return: Reader instance

        """
        return self.csv_reader(f, self.csv_dialect, **kwargs)

    def _read_header(self):
        """
        Get the header, this needs to be called **once** only!

        Consumes the next row from the underlying reader; when
        ``ignore_header_case`` is set the row is wrapped in a
        ``CaseLessStringList``.
        """
        header = next(self._reader)
        if self.ignore_header_case:
            header = CaseLessStringList(header)
        return header

    @lazy_property
    def field_names(self):
        """
        Field names from resource.
        """
        fields = getmeta(self.resource_type).fields
        if self.ignore_header_case:
            return CaseLessStringList(field.name for field in fields)
        else:
            return tuple(field.name for field in fields)

    @lazy_property
    def extra_field_names(self):
        """
        Header fields that are not defined on the resource.

        Requires ``self.header``, which is only set when a header was read.
        """
        return tuple(field for field in self.header if field not in self.field_names)

    @lazy_property
    def field_mapping(self):
        """
        Index mapping of CSV fields to resource fields.

        One entry per resource field (the column index in the header, or
        ``NotProvided`` when the header lacks that field), followed by the
        column indexes of any extra header fields.
        """
        mapping = []

        # Add expected fields
        header = self.header
        for name in self.field_names:
            if name in header:
                mapping.append(header.index(name))
            else:
                mapping.append(NotProvided)

        # Append any extra fields
        for name in self.extra_field_names:
            mapping.append(header.index(name))

        return tuple(mapping)
def reader(
    f,
    resource,
    includes_header=False,
    csv_module=csv,
    full_clean=True,
    ignore_header_case=False,
    strict_fields=False,
    **kwargs
):
    """
    Build a :class:`Reader` that yields resource objects from a CSV file.

    :param f: file like object
    :param resource: Resource type used as the field template.
    :param includes_header: File includes a header that should be used to
        map columns
    :param csv_module: Specify an alternate csv module (eg unicodecsv);
        defaults to the builtin csv as this module is implemented in C.
    :param full_clean: Perform a full clean on each object
    :param ignore_header_case: Ignore the letter case on header
    :param strict_fields: Extra fields cannot be provided.
    :param kwargs: Any further keyword arguments are forwarded to
        :class:`Reader`.
    :return: Iterable reader object
    :rtype: Reader

    """
    # Only the ``reader`` factory of the supplied module is needed.
    csv_reader = csv_module.reader
    return Reader(
        f,
        resource,
        full_clean,
        strict_fields=strict_fields,
        ignore_header_case=ignore_header_case,
        includes_header=includes_header,
        csv_reader=csv_reader,
        **kwargs
    )
def value_fields(resource):
    """
    Return the list of non-composite (eg value) fields of a resource, in the
    order they appear in the resource's ``all_fields``, for export.
    """
    meta = getmeta(resource)
    selected = []
    for field in meta.all_fields:
        # Composite fields cannot be represented in a flat CSV row
        if field not in meta.composite_fields:
            selected.append(field)
    return selected
def _get_resource_type(resources, resource_type): if isinstance(resources, bases.TypedResourceIterable): # Use first resource to obtain field list return resource_type or resources.resource_type elif isinstance(resources, bases.ResourceIterable) and resource_type: return resource_type elif isinstance(resources, (list, tuple)): if not len(resources): return # Use first resource to obtain field list return resource_type or resources[0] else: raise Exception("Not supported input format")
def dump_to_writer(writer, resources, resource_type=None, fields=None):
    """
    Dump resources to a CSV writer interface.

    The interface should expose the :py:class:`csv.writer` interface.

    :type writer: :py:class:`csv.writer`
    :param writer: Writer object
    :param fields: List of fields to write; when supplied the resource type
        is not needed and *resources* may be any iterable.
    :param resources: Collection of resources to dump.
    :param resource_type: Resource type to use for CSV columns; if None the
        first resource will be used.
    :returns: List of fields that were written to; an empty list if there is
        nothing to write and the columns cannot be determined.

    """
    if not fields:
        # The resource type is only needed to derive the field list
        resource_type = resource_type or _get_resource_type(resources, resource_type)
        if resource_type is None:
            # Empty collection with no explicit type: no columns, no rows
            return []
        fields = value_fields(resource_type)

    # Iterate resources and write to CSV
    for resource in resources:
        row = [field.prepare(field.value_from_object(resource)) for field in fields]
        writer.writerow(row)

    return fields
def dump(
    f, resources, resource_type=None, include_header=True, cls=csv.writer, **kwargs
):
    """
    Dump resources into a CSV file.

    If *resources* is empty and no *resource_type* is given, the columns
    cannot be determined and nothing is written.

    :param f: File to dump to.
    :param resources: Collection of resources to dump.
    :param resource_type: Resource type to use for CSV columns; if None the
        first resource will be used.
    :param include_header: Write a CSV header.
    :param cls: Writer to use when writing CSV, this should be based on
        :class:`csv.writer`.
    :param kwargs: Additional parameters to be supplied to the writer instance.

    """
    # Prefer the explicit type so it is honoured even for an empty collection
    resource_type = resource_type or _get_resource_type(resources, resource_type)
    if resource_type is None:
        # No type and no resources to infer one from: nothing to write
        return

    fields = value_fields(resource_type)

    # Setup CSV
    writer = cls(f, **kwargs)

    # Write header
    if include_header:
        writer.writerow([field.name for field in fields])

    dump_to_writer(writer, resources, resource_type, fields)
def dumps(resources, resource_type=None, cls=csv.writer, **kwargs):
    """
    Dump resources to CSV and return the result as a string.

    :param resources: Collection of resources to dump.
    :param resource_type: Resource type to use for CSV columns; if None the
        first resource will be used.
    :param cls: Writer to use when writing CSV, this should be based on
        :class:`csv.writer`.
    :param kwargs: Additional parameters to be supplied to the writer instance.
    :return: The CSV text.

    """
    # Collect the output in memory, then hand back the accumulated text
    stream = StringIO()
    dump(stream, resources, cls=cls, resource_type=resource_type, **kwargs)
    text = stream.getvalue()
    return text