python-examples/csv/csv_processing.py at master · ImageRec/python-examples

149 lines (117 loc) · 5.65 KB
""" There are many ways to process csv files, each with its own advantages
    and disadvantages. You can use csv loops, pandas' `read_csv`, or create
    a class. Python 2's csv module doesn't support Unicode input, so we use
    the `unicodecsv` module when we can; it handles the encoding to and
    from 'UTF-8', which the csv module can read.
    Basics of CSV Processing from documentation and tutorial:
    https://docs.python.org/2/library/csv.html
    https://districtdatalabs.silvrback.com/simple-csv-data-wrangling-with-python
"""
import collections  # high performance container datatypes
import sys # for warnings

import pandas as pd

try:
    import unicodecsv as csv  # for UTF8Recoder, UnicodeReader, UnicodeWriter classes
except ImportError:
    import warnings
    warnings.warn("can't import `unicodecsv` encoding errors may occur")
    import csv
def read_simple_data(path):
    """ Lazily read a csv file that has a header row (i.e. no more than one
        row is in memory at a time).

        Few things to note:
        * on Python 2 the file is opened in mode 'rU' (universal newlines);
          the 'U' flag was removed in Python 3.11, so on Python 3 the file is
          opened in text mode with newline='' as the csv docs recommend
        * use csv.DictReader only when headers present, otherwise csv.reader
        * using `with [callable] as [name]` ensures handle automatically closes

        :param path: path of the csv file to read
        :returns: generator yielding one dict per data row, keyed by header
    """
    # NOTE(review): if `csv` is really `unicodecsv` running on Python 3 it
    # may expect a binary handle instead -- confirm before relying on it.
    if sys.version_info[0] < 3:
        mode, extra = 'rU', {}
    else:
        mode, extra = 'r', {'newline': ''}
    with open(path, mode, **extra) as data:
        reader = csv.DictReader(data)  # use csv.reader if no header
        for row in reader:
            # being a generator, rows are handed out one at a time and the
            # whole file never has to be loaded in memory
            yield row
def read_data_catch_errors(path):
    """ Lazily read a csv file with a header row, reporting parse errors.

        Rows are yielded one at a time from a csv.DictReader. If the parser
        raises csv.Error, the program is ended through sys.exit() with a
        message naming the file and the line the reader had reached.

        :param path: path of the csv file to read
        :returns: generator yielding one dict per data row, keyed by header
        :raises SystemExit: when the csv parser reports a csv.Error
    """
    # 'rU' only exists on Python 2 (the 'U' flag was removed in Python 3.11);
    # Python 3's csv module wants a text handle opened with newline=''.
    # NOTE(review): `unicodecsv` on Python 3 may want 'rb' instead -- confirm.
    if sys.version_info[0] < 3:
        mode, extra = 'rU', {}
    else:
        mode, extra = 'r', {'newline': ''}
    with open(path, mode, **extra) as data:
        reader = csv.DictReader(data)
        try:
            for row in reader:
                yield row
        except csv.Error as e:
            sys.exit('File %s, line %d: %s' % (path, reader.line_num, e))
def write_simple_data(path, rows=()):
    """ Write data instead of read data.

        :param path: destination csv file; created, or truncated if it exists
        :param rows: iterable of rows, each row being an iterable of cell
                     values; defaults to no rows, which leaves an empty file
    """
    # Python 2's csv writer needs a binary handle ('wb'); Python 3's needs a
    # text handle opened with newline='' so it controls the line endings.
    # NOTE(review): `unicodecsv` on Python 3 may want 'wb' instead -- confirm.
    if sys.version_info[0] < 3:
        mode, extra = 'wb', {}
    else:
        mode, extra = 'w', {'newline': ''}
    with open(path, mode, **extra) as data:
        writer = csv.writer(data)
        # a writer object is not iterable; the rows come from the caller
        writer.writerows(rows)
class FundingReader(object):
    """ A class can save some state of the data between reads.

        Iterating over an instance yields the csv rows as dicts and, along
        the way, records how many rows were seen and how many rows each
        value of the 'company' column has. `len(reader)` and `reader.counter`
        read the file themselves only when no statistics are stored yet.

        The statistics describe the latest pass, so after a pass that was
        abandoned part-way they are partial; call `reset()` to drop them.
    """

    def __init__(self, path):
        """ :param path: csv file with a header row that has a 'company' column
        """
        self.path = path
        self._length = None   # rows seen by the latest pass; None = no pass yet
        self._counter = None  # rows per company for the latest pass

    def __iter__(self):
        """ Yield every row as a dict, rebuilding the statistics from zero. """
        self._length = 0
        self._counter = collections.Counter()
        # 'rU' only exists on Python 2 (the 'U' flag was removed in Python
        # 3.11); Python 3's csv module wants text mode with newline=''.
        if sys.version_info[0] < 3:
            mode, extra = 'rU', {}
        else:
            mode, extra = 'r', {'newline': ''}
        with open(self.path, mode, **extra) as data:
            reader = csv.DictReader(data)
            for row in reader:
                # Save these statistics
                self._length += 1
                self._counter[row['company']] += 1
                yield row

    def __len__(self):
        """ Number of data rows; reads the file first if it was never read. """
        if self._length is None:
            self._scan()
        return self._length

    @property
    def counter(self):
        """ collections.Counter mapping each company to its number of rows. """
        if self._counter is None:
            self._scan()
        return self._counter

    @property
    def companies(self):
        """ The distinct company names (the keys of `counter`). """
        return self.counter.keys()

    def reset(self):
        """ In case of partial seeks (e.g. breaking in the middle of the read)
        """
        self._length = None
        self._counter = None

    def _scan(self):
        """ Run one full pass purely to fill in the length and the counter. """
        for _ in self:
            pass
### Adds support for reading and writing Unicode (UTF-8, not UTF-16)
### Apply by wrapping your input with these functions, or just use `unicodecsv`
#  https://docs.python.org/2/library/csv.html
def unicode_csv_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
    """ Parse an iterable of Unicode text lines as csv, yielding Unicode rows.

        :param unicode_csv_data: iterable of Unicode strings, one per line
        :param dialect: csv dialect handed to csv.reader
        :param kwargs: extra formatting parameters handed to csv.reader
        :returns: generator yielding each row as a list of Unicode cells
    """
    if sys.version_info[0] >= 3:
        # Python 3's csv module works on text directly; the UTF-8 round trip
        # below would hand it bytes (which it rejects) and needs the
        # Python 2-only `unicode` type.
        for row in csv.reader(unicode_csv_data, dialect=dialect, **kwargs):
            yield row
    else:
        # csv.py doesn't do Unicode; encode temporarily as UTF-8:
        csv_reader = csv.reader(utf_8_encoder(unicode_csv_data),
                                dialect=dialect, **kwargs)
        for row in csv_reader:
            # decode UTF-8 back to Unicode, cell by cell:
            yield [cell.decode('utf-8') for cell in row]
def utf_8_encoder(unicode_csv_data):
    """ Lazily re-encode each line of text as UTF-8 bytes.

        :param unicode_csv_data: iterable of text lines
        :returns: generator yielding every line encoded as UTF-8
    """
    for text_line in unicode_csv_data:
        encoded_line = text_line.encode('utf-8')
        yield encoded_line
### Note: recommend just importing `unicodecsv` instead of these two functions
if __name__ == '__main__':
    # Demo: read 'funding.csv' from the current directory with each of the
    # approaches defined in this file. Single-argument print(...) is used so
    # the script parses on both Python 2 and Python 3.

    ### READING DATA
    mypath = 'funding.csv'
    #|-----------+------------+-------- +---------+------------+-------+------------+-----------+----------------+------|
    #|permalink  | company    | numEmps | category| city       | state | fundedDate | raisedAmt | raisedCurrency | round|
    #|-----------+------------+-------- +---------+------------+-------+------------+-----------+----------------+------|
    #|lifelock   | LifeLock   |         | web     | Tempe      | AZ    | 1-May-07   | 6850000   | USD            | b    |
    #|lifelock   | LifeLock   |         | web     | Tempe      | AZ    | 1-Oct-06   | 6000000   | USD            | a    |
    #|lifelock   | LifeLock   |         | web     | Tempe      | AZ    | 1-Jan-08   | 25000000  | USD            | c    |
    #|mycityfaces| MyCityFaces| 7       | web     | Scottsdale | AZ    | 1-Jan-08   | 50000     | USD            | seed |

    # Both read loops print rows the same way; the keys are csv column names.
    row_format = "%(company)s (%(numEmps)s employees) raised %(raisedAmt)s on %(fundedDate)s"

    ### Using simple csv method - Print first 11 items - shows we can read the file using our function
    for index, row in enumerate(read_simple_data(mypath)):
        if index > 10:
            break
        print(row_format % row)

    ### Using same as above, but catches and returns errors
    for index, row in enumerate(read_data_catch_errors(mypath)):
        if index > 10:
            break
        print(row_format % row)

    ### Using pandas.read_csv method
    #df = pd.read_csv(filepath_or_buffer=mypath, sep=",")
    #print(df.head())

    ### Using csv class
    reader = FundingReader(mypath)
    print("Funding reader with %i rows and %i companies" % (len(reader), len(reader.companies)))

    ### WRITING
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

csv_processing.py

Latest commit

History

csv_processing.py

File metadata and controls