forked from WilliamQLiu/python-examples
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcsv_processing.py
More file actions
149 lines (117 loc) · 5.65 KB
/
csv_processing.py
File metadata and controls
149 lines (117 loc) · 5.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
"""
There's many ways to process csv files, each with its own adv/disadv.
You can use csv loops, pandas' DataFrame.read_csv, or create a class.
The csv module doesn't support encoding, reading, or writing data from
a file so we use the `unicodecsv` module when we can; this helps
translate the native file to 'UTF-8', which the csv module can read.
Basics of CSV Processing from documentation and tutorial:
https://docs.python.org/2/library/csv.html
https://districtdatalabs.silvrback.com/simple-csv-data-wrangling-with-python
"""
import pandas as pd
import collections # high performance container datatypes
import sys # for warnings
try:
import unicodecsv as csv # for UTF8Recoder, UnicodeReader, UnicodeWriter classes
except ImportError:
import warnings
warnings.warn("can't import `unicodecsv` encoding errors may occur")
import csv
def read_simple_data(path):
""" How to read a csv file lazily (i.e. no more than one row is in mem),
Few things to note:
* mode 'rU' is opening file in universal newline mode
* use csv.DictReader only when headers present, otherwise csv.reader
* using `with [callable] as [name]` ensures handle automatically closes
"""
with open(path, 'rU') as data: # wrap reader in a function that returns a generator
reader = csv.DictReader(data) # use csv.reader if no header
for row in reader:
yield row # yield returns the generator, don't need to upload all in memory
def read_data_catch_errors(path):
""" Same as above, but catches and reports on errors """
with open(path, 'rU') as data: # also can try rb
reader = csv.DictReader(data)
try:
for row in reader:
yield row
except csv.Error as e:
sys.exit('File %s, line %d: %s' % (path, reader.line_num, e))
def write_simple_data(path):
""" Write data instead of read data """
with open(path, 'wb') as data:
writer = csv.writer(data)
for row in writer:
writer.writerows(row)
class FundingReader(object):
""" A class can save some state of the data between reads """
def __init__(self, path):
self.path = path
self._length = None
self._counter = None
def __iter__(self):
self._length = 0
self._counter = collections.Counter()
with open(self.path, 'rU') as data:
reader = csv.DictReader(data)
for row in reader:
# Save these statistics
self._length += 1
self._counter[row['company']] += 1
yield row
def __len__(self):
if self._length is None:
for row in self: continue # Read the data for length and counter
return self._length
@property
def counter(self):
if self._counter is None:
for row in self: continue # Read the data for length and counter
return self._counter
@property
def companies(self):
return self.counter.keys()
def reset(self):
""" In case of partial seeks (e.g. breaking in the middle of the read)
"""
self._length = None
self._counter = None
### Adds support for reading and writing Unicode (UTF-8, not UTF-16)
### Apply by using these as a wrapped function or just use `unicodecsv`
# https://docs.python.org/2/library/csv.html
def unicode_csv_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
# csv.py doesn't do Unicode; encode temporarily as UTF-8:
csv_reader = csv.reader(utf_8_encoder(unicode_csv_data),
dialect=dialect, **kwargs)
for row in csv_reader:
# decode UTF-8 back to Unicode, cell by cell:
yield [unicode(cell, 'utf-8') for cell in row]
def utf_8_encoder(unicode_csv_data):
for line in unicode_csv_data:
yield line.encode('utf-8')
### Note: recommend just importing `unicodecsv` instead of these two methods
if __name__ == '__main__':
### READING DATA
mypath = 'funding.csv'
#|-----------+------------+-------- +---------+------------+-------+------------+-----------+----------------+------|
#|permalink | company | numEmps | category| city | state | fundedDate | raisedAmt | raisedCurrency | round|
#|-----------+------------+-------- +---------+------------+-------+------------+-----------+----------------+------|
#|lifelock | LifeLock | | web | Tempe | AZ | 1-May-07 | 6850000 | USD | b |
#|lifelock | LifeLock | | web | Tempe | AZ | 1-Oct-06 | 6000000 | USD | a |
#|lifelock | LifeLock | | web | Tempe | AZ | 1-Jan-08 | 25000000 | USD | c |
#|mycityfaces| MyCityFaces| 7 | web | Scottsdale | AZ | 1-Jan-08 | 50000 | USD | seed |
### Using simple csv method - Print first 11 items - shows we can read the file using our function
for index, row in enumerate(read_simple_data(mypath)):
if index > 10: break
print "%(company)s (%(numEmps)s employees) raised %(raisedAmt)s on %(fundedDate)s" % row
### Using same as above, but catches and returns errors
for index, row in enumerate(read_data_catch_errors(mypath)):
if index > 10: break
print "%(company)s (%(numEmps)s employees) raised %(raisedAmt)s on %(fundedDate)s" % row
### Using pandas.read_csv method
#df = pd.read_csv(filepath_or_buffer=mypath, sep=",")
#print df.head()
### Using csv class
reader = FundingReader(mypath)
print "Funding reader with %i rows and %i companies" % (len(reader), len(reader.companies))
### WRITING