Dict Format:
{'Title': 'Please Please Me', 'UK Chart Position': '1', 'Label': 'Parlophone(UK)',
'Released': '22 March 1963', 'US Chart Position': '-', 'RIAA Certification': 'Platinum',
'BPI Certification': 'Gold'}
Use first line of the file as a header of dict.
def parse_file(datafile):
    """Parse a CSV file by hand and return its first 10 data rows.

    The first line is used as the header; each later line becomes a dict
    mapping header field -> stripped value.  NOTE: naive comma splitting
    breaks on quoted fields that contain commas -- acceptable for this
    dataset only; prefer the csv module in general.
    """
    data = []
    # Open in text mode ("r"), not "rb": in Python 3 a binary file yields
    # bytes and str.split(",") on bytes raises TypeError.
    with open(datafile, "r") as f:
        header = f.readline().split(",")
        for counter, line in enumerate(f):
            if counter == 10:  # keep only the first 10 data rows
                break
            entry = {}
            for i, value in enumerate(line.split(",")):
                entry[header[i].strip()] = value.strip()
            data.append(entry)
    return data
def parse_file(datafile):
    """Return the first 10 rows of a CSV file as dicts keyed by the header.

    Fixes the original, which re-split every line once per header field
    (quadratic work per row) and read the entire file into memory before
    slicing off the first 10 rows.
    """
    data = []
    with open(datafile, "r") as f:
        header = [key.strip() for key in f.readline().split(",")]
        for line in f:
            if len(data) == 10:  # stop reading early instead of data[0:10]
                break
            values = [v.strip() for v in line.split(",")]
            # zip pairs each header field with its value; split once per line.
            data.append(dict(zip(header, values)))
    return data
import csv
def parse_file(datafile):
    """Return the first 10 data rows of a CSV file via csv.DictReader."""
    from itertools import islice  # local import keeps this snippet standalone
    with open(datafile, "r") as sd:
        # islice stops after 10 rows instead of loading the whole file
        # and slicing data[0:10] afterwards.
        return list(islice(csv.DictReader(sd), 10))
import os
DATADIR = ""
DATAFILE = "beatles-diskography.csv"
def test():
    """A simple sanity check of the parse_file implementation."""
    d = parse_file(os.path.join(DATADIR, DATAFILE))
    firstline = {'Title': 'Please Please Me', 'UK Chart Position': '1',
                 'Label': 'Parlophone(UK)', 'Released': '22 March 1963',
                 'US Chart Position': '-', 'RIAA Certification': 'Platinum',
                 'BPI Certification': 'Gold'}
    tenthline = {'Title': '', 'UK Chart Position': '1',
                 'Label': 'Parlophone(UK)', 'Released': '10 July 1964',
                 'US Chart Position': '-', 'RIAA Certification': '',
                 'BPI Certification': 'Gold'}
    assert d[0] == firstline
    assert d[9] == tenthline


test()
pip install xlrd
import xlrd
datafile = "2013_ERCOT_Hourly_Load_Data.xls"
def parse_file(datafile):
    """Open an Excel workbook with xlrd and demo the main cell-access APIs.

    Returns the whole first sheet as a list of row lists.  The original
    used Python 2 print statements, which are syntax errors in Python 3;
    they are converted to print() calls here.
    """
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)

    # Entire sheet as a list of lists via a nested comprehension.
    data = [[sheet.cell_value(r, col)
             for col in range(sheet.ncols)]
            for r in range(sheet.nrows)]

    print("\nList Comprehension")
    print("data[3][2]:", data[3][2])

    print("\nCells in a nested loop:")
    for row in range(sheet.nrows):
        for col in range(sheet.ncols):
            if row == 50:
                # end=" " mimics Python 2's trailing-comma print.
                print(sheet.cell_value(row, col), end=" ")

    ### other useful methods:
    print("\nROWS, COLUMNS, and CELLS:")
    print("Number of rows in the sheet:", sheet.nrows)
    print("Type of data in cell (row 3, col 2):", sheet.cell_type(3, 2))
    print("Value in cell (row 3, col 2):", sheet.cell_value(3, 2))
    print("Get a slice of values in column 3, from rows 1-3:")
    print(sheet.col_values(3, start_rowx=1, end_rowx=4))

    print("\nDATES:")
    print("Type of data in cell (row 1, col 0):", sheet.cell_type(1, 0))
    exceltime = sheet.cell_value(1, 0)
    print("Time in Excel format:", exceltime)
    print("Convert time to a Python datetime tuple, from the Excel float:",
          xlrd.xldate_as_tuple(exceltime, 0))

    return data


data = parse_file(datafile)
"""
Your task is as follows:
- read the provided Excel file
- find and return the min, max and average values for the COAST region
- find and return the time value for the min and max entries
- the time values should be returned as Python tuples
Please see the test function for the expected return format
"""
import numpy as np
import xlrd
from zipfile import ZipFile
datafile = "2013_ERCOT_Hourly_Load_Data.xls"
def open_zip(datafile):
    """Extract "<datafile minus .xls>.zip" into the current directory.

    Fixes the original, which called re.sub() without importing re
    (NameError at runtime).  A literal str.replace is sufficient here.
    """
    zipname = "{0}.zip".format(datafile.replace(".xls", ""))
    with ZipFile(zipname, "r") as myzip:
        myzip.extractall()
def parse_file(datafile):
    """Return min/max/average of the COAST column plus times of the extremes.

    Column 1 holds the COAST load values; column 0 holds Excel date floats,
    converted to datetime tuples via xlrd.xldate_as_tuple.
    """
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)
    coast = sheet.col_values(1, start_rowx=1, end_rowx=None)

    # Compute each extreme once instead of re-scanning the column four times
    # (the original called max()/min() twice each plus two .index scans).
    maxvalue = max(coast)
    minvalue = min(coast)
    maxpos = coast.index(maxvalue)
    minpos = coast.index(minvalue)

    # +1: col_values skipped the header row, but cell_value indices did not.
    data = {
        'maxtime': xlrd.xldate_as_tuple(sheet.cell_value(maxpos + 1, 0), 0),
        'maxvalue': maxvalue,
        'mintime': xlrd.xldate_as_tuple(sheet.cell_value(minpos + 1, 0), 0),
        'minvalue': minvalue,
        'avgcoast': np.mean(coast),
    }
    return data
def test():
    """Sanity-check parse_file against known 2013 ERCOT COAST values."""
    open_zip(datafile)
    results = parse_file(datafile)
    assert results['maxtime'] == (2013, 8, 13, 17, 0, 0)
    assert round(results['maxvalue'], 10) == round(18779.02551, 10)


test()
The two most important functions in Python JSON are: .loads() and .dumps()
.loads() : it converts JSON_string(string type but content looks like JSON) to dict. Purpose: string is dull and non-loop-able, convert to dict for easy management and handling.
-Ex:
json_string = '{"first_name": "Guido", "last_name":"Rossum"}'
import json
parsed_json = json.loads(json_string)
print parsed_json
print "=========================="
print type(parsed_json)
Output:
{u'first_name': u'Guido', u'last_name': u'Rossum'}
==========================
<type 'dict'>
.dumps() : it converts dict to JSON_string(string type but content looks like JSON). Purpose: it does not make much sense if we only convert a dict to string(JSON) and do nothing else; the .dumps() contains some fields such as indent or sort_keys. It helps to print the JSON_string in a customized pretty way.
-Ex:
d = {
    'first_name': 'Guido',
    'second_name': 'Rossum',
    'titles': ['BDFL', 'Developer'],
}

# Python 2 print statements converted to Python 3 print() calls.
print("==================================== BEFORE =========================================")
print(d, "\n")
print("===================================== AFTER =========================================")


def pretty_print(data, indent=4):
    """Print *data*: dicts as sorted, indented JSON; anything else as-is."""
    # isinstance() is the idiomatic type check (handles subclasses too),
    # unlike the original's type(data) == dict.
    if isinstance(data, dict):
        print(json.dumps(data, indent=indent, sort_keys=True))
    else:
        print(data)


pretty_print(d)
Output:
==================================== BEFORE =========================================
{'first_name': 'Guido', 'titles': ['BDFL', 'Developer'], 'second_name': 'Rossum'}
===================================== AFTER =========================================
{
"first_name": "Guido",
"second_name": "Rossum",
"titles": [
"BDFL",
"Developer"
]
}
There are some important things in requests module that we need to focus : requests.get(url, params) , object.status_code == requests.codes.ok , object.json() with object.raise_for_status() . You understand them all by the following examples:
.loads() : it converts JSON_string(string type but content looks like JSON) to dict. Purpose: string is dull and non-loop-able, convert to dict for easy management and handling.
-Ex:
import json
import requests
BASE_URL = "http://musicbrainz.org/ws/2/"
ARTIST_URL = BASE_URL + "artist/"
# query parameters are given to the requests.get function as a dictionary; this
# variable contains some starter parameters.
query_type = { "simple": {},
"atr": {"inc": "aliases+tags+ratings"},
"aliases": {"inc": "aliases"},
"releases": {"inc": "releases"}}
def query_site(url, params, uid="", fmt="json"):
    """Main helper for musicbrainz API queries; returns the parsed JSON body.

    Raises an HTTP error (via raise_for_status) on a non-OK response.
    The original's Python 2 print statements are converted to print().
    """
    params["fmt"] = fmt  # NOTE: mutates the caller's params dict
    r = requests.get(url + uid, params=params)
    print("params:", params)
    print("requesting", r.url)
    if r.status_code == requests.codes.ok:
        return r.json()
    else:
        r.raise_for_status()
def query_by_name(url, params, name):
    """Add an artist-name filter to *params*, then run the query via query_site."""
    # Builds a Lucene-style "artist:<name>" search term for the API.
    params["query"] = "artist:{0}".format(name)
    return query_site(url, params)
results = query_by_name(ARTIST_URL, query_type["simple"], "Nirvana")
print results
Output:
requesting http://musicbrainz.org/ws/2/artist/?query=artist%3ANirvana&fmt=json
{u'count': 14, u'offset': 0, u'artists': [{u'name': u'Nirvana', u'area': {u'sort-name': u'United States', u'id': u'489ce91b-6658-3307-9877-795b68554c98', u'name': u'United States'}, u'country': u'US', u'disambiguation': u'90s US grunge band', u'tags': [{u'count': 1, u'name': u'punk'}, {u'count': 0, u'name': u'legendary'}, {u'count': 0, u'name': u'90'}, {u'count': 1, u'name': u'seattle'}, {u'count': 0, u'name': u'northwest'}, {u'count': 0, u'name': u'alternative'}, {u'count': 0, u'name': u'rock and indie'}, {u'count': 1, u'name': u'usa'}, {u'count': 0, u'name': u'am\xe9ricain'}, {u'count': 0, u'name': u'united states'}, {u'count': 0, u'name': u'kurt cobain'}
........
So, the code requests.get the following url: http://musicbrainz.org/ws/2/artist/
and params: ?query=artist%3ANirvana&fmt=json
. In this case, params is constructed as a dict with some following queries:
params= {'query': 'artist:Nirvana', 'fmt': 'json'}
import json
import requests

# Minimal requests example: search musicbrainz for artist "Nirvana".
url = "http://musicbrainz.org/ws/2/artist/"
params = {}
params['query'] = 'artist:Nirvana'
params['fmt'] = 'json'
results = requests.get(url, params=params)
# Python 2 print statements converted to print() calls.
print(results.url)      # full request URL with the encoded query string
print(results.json())   # response body decoded from JSON
Output:
http://musicbrainz.org/ws/2/artist/?query=artist%3ANirvana&fmt=json
{u'count': 14, u'offset': 0, u'artists': [{u'name': u'Nirvana', u'area': {u'sort-name': u'United States', u'id': u'489ce91b-6658-3307-9877-795b68554c98', u'name': u'United States'}, u'country': u'US', u'disambiguation': u'90s US grunge band', u'tags': [{u'count': 1, u'name': u'punk'}, {u'count': 0, u'name': u'legendary'}, {u'count': 0, u'name': u'90'}, {u'count': 1, u'name': u'seattle'}, {u'count': 0, u'name': u'northwest'}, {u'count': 0, u'name': u'alternative'}, {u'count': 0, u'name': u'rock and indie'}, {u'count': 1, u'name': u'usa'},...
Use this website to understand more about API of musicbrainz
def pretty_print(data, indent=4):
    """Print *data*: dicts as sorted, indented JSON; anything else as-is."""
    # isinstance() replaces type(data) == dict; print() replaces the
    # Python 2 print statement.
    if isinstance(data, dict):
        print(json.dumps(data, indent=indent, sort_keys=True))
    else:
        print(data)
Copy to Clipboard:
import clipboard
def pretty_copy(data, indent=4):
    """Copy *data* to the system clipboard, dicts as sorted, indented JSON.

    Non-dict values are copied verbatim.  Requires the third-party
    `clipboard` package (imported at file level).
    """
    if isinstance(data, dict):  # isinstance, not type(data) == dict
        temp = json.dumps(data, indent=indent, sort_keys=True)
    else:
        temp = data
    clipboard.copy(temp)
SOLUTION :
import json
import requests

url = "http://musicbrainz.org/ws/2/artist/"

# how many bands named "First Aid Kit"
params = {}
params["query"] = "artist:First Aid Kit"
params["fmt"] = "json"
data = requests.get(url, params=params).json()
# Count exact sort-name matches among the returned artists.
count = sum(1 for a in data["artists"] if a["sort-name"] == "First Aid Kit")
print(count)  #
#############################################################
# begin-area name for queen
params["query"] = "artist:Queen"
data = requests.get(url, params=params).json()
for band in data["artists"]:
    if band["sort-name"] == "Queen" and "begin-area" in band:
        print(band["begin-area"]["name"])
# London
#############################################################
# Spanish alias for Beatles?
params["query"] = "artist:Beatles"
data = requests.get(url, params=params).json()
for band in data["artists"]:
    if band["name"] == "The Beatles":
        for alias in band["aliases"]:
            print(alias["locale"], alias["name"])
# one of the output lines will be:
# es Los Beatles
#############################################################
# Nirvana disambiguation
params["query"] = "artist:Nirvana"
data = requests.get(url, params=params).json()
for band in data["artists"]:
    if band["name"] == "Nirvana":
        print(band["country"], "\t", band["disambiguation"])
# one of the output lines will be:
# US 90s US grunge band
#############################################################
# when was "One Direction" formed
params["query"] = "artist:One Direction"
data = requests.get(url, params=params).json()
for band in data["artists"]:
    if band["name"] == "One Direction":
        print(band["life-span"]["begin"])
# 2010-07
# Persist the last query result; "with" guarantees the file is closed even
# if json.dumps raises (the original open/close pair leaked on exceptions).
with open("test.json", mode="w") as f:
    f.write(json.dumps(data, sort_keys=True, indent=4))
response.url     # the full URL
response.text    # text result
response.json()  # result in json format

print(json.dumps(response.json(), sort_keys=True, indent=4))

# artist_id avoids shadowing the id() builtin (original used `id`).
artist_id = response.json()["artists"][1]["id"]
url = "http://musicbrainz.org/ws/2/artist/" + str(artist_id)
params = {"inc": "releases", "fmt": "json"}
response = requests.get(url, params=params)
if response.status_code == requests.codes.ok:
    print("request successful")

data = response.json()
data.keys()                  # there's a key called "releases"
releases = data["releases"]  # this is a list, each is a dictionary with "title"
for release in releases:
    print(release["title"])
A) INSTRUCTIONS:
Your task is to process the supplied file use the csv module to extract data from it.
Note that the first line of the datafile is neither data entry, nor header. It is a line describing the data source. You should extract the name of the station from it.
The data should be returned as a list of lists (not dictionaries). You can use the csv modules "reader" method to get data in such format. Another useful method is next() - to get the next line from the iterator. You should only change the parse_file function.
QUICK LOOK AT CSV FILE:
GIVEN CODE:
import csv
import os
DATADIR = ""
DATAFILE = "745090.csv"
def parse_file(datafile):
    """Course-supplied stub: the extraction logic goes inside the with-block."""
    name = ""
    data = []
    with open(datafile, 'rb') as f:
        pass
    # Do not change the line below
    return (name, data)
def test():
    """Spot-check the parsed station name and a few individual data cells."""
    name, data = parse_file(os.path.join(DATADIR, DATAFILE))
    assert name == "MOUNTAIN VIEW MOFFETT FLD NAS"
    expected_cells = {(0, 1): "01:00", (2, 0): "01/01/2005", (2, 5): "2"}
    for (row, col), value in expected_cells.items():
        assert data[row][col] == value


if __name__ == "__main__":
    test()
B) SOLUTION:
#!/usr/bin/env python
import csv
import os
DATADIR = ""
DATAFILE = "745090.csv"
def parse_file(datafile):
    """Extract the station name and all data rows from the weather CSV.

    The first file line describes the data source; the station name is its
    second comma-separated field (quotes stripped).  The second line is
    skipped, and the rest is parsed by csv.reader into a list of lists.
    Returns (name, data).
    """
    name = ""
    data = []
    # Text mode + next(f): the original used "rb" and f.next(), both of
    # which are Python-2-only (f.next() was removed; py3 csv needs str).
    with open(datafile, 'r') as f:
        name = next(f).split(',')[1].strip('"')
        next(f)  # skip the header line
        for line in csv.reader(f):
            data.append(line)
    # Do not change the line below
    return (name, data)
def test():
    """Verify the station name plus a handful of known data cells."""
    name, data = parse_file(os.path.join(DATADIR, DATAFILE))
    assert name == "MOUNTAIN VIEW MOFFETT FLD NAS"
    checks = [((0, 1), "01:00"), ((2, 0), "01/01/2005"), ((2, 5), "2")]
    for (row, col), expected in checks:
        assert data[row][col] == expected


if __name__ == "__main__":
    test()
C) WHAT PROBLEMS DO I HAVE DURING EXTRACT CSV FILES:
A) INSTRUCTIONS:
Find the time and value of max load for each of the regions COAST, EAST, FAR_WEST, NORTH, NORTH_C, SOUTHERN, SOUTH_C, WEST and write the result out in a csv file, using pipe character | as the delimiter.
An example output can be seen in the "example.csv" file.
QUICK LOOK AT EXCEL FILE:
QUICK LOOK AT CSV OUTPUT:
GIVEN CODE:
# -*- coding: utf-8 -*-
import xlrd
import os
import csv
from zipfile import ZipFile
datafile = "2013_ERCOT_Hourly_Load_Data.xls"
outfile = "2013_Max_Loads.csv"
def open_zip(datafile):
    """Unpack "<datafile>.zip" into the current working directory."""
    zipname = '{0}.zip'.format(datafile)
    with ZipFile(zipname, 'r') as archive:
        archive.extractall()
def parse_file(datafile):
    """Course-supplied stub: open the workbook, then fill in the parsing."""
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)
    data = None
    # YOUR CODE HERE
    return data
def save_file(data, filename):
    """Course-supplied stub: write *data* to *filename*."""
    # YOUR CODE HERE
    # The original def had no statement body at all (just a comment),
    # which is a SyntaxError in Python; pass makes the stub loadable.
    pass
def test():
    """Validate the output CSV: station names, row count, FAR_WEST values."""
    open_zip(datafile)
    data = parse_file(datafile)
    save_file(data, outfile)

    number_of_rows = 0
    stations = []

    ans = {'FAR_WEST': {'Max Load': '2281.2722140000024',
                        'Year': '2013',
                        'Month': '6',
                        'Day': '26',
                        'Hour': '17'}}
    correct_stations = ['COAST', 'EAST', 'FAR_WEST', 'NORTH',
                        'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST']
    fields = ['Year', 'Month', 'Day', 'Hour', 'Max Load']

    with open(outfile) as fh:
        reader = csv.DictReader(fh, delimiter="|")
        for row in reader:
            station = row['Station']
            if station == 'FAR_WEST':
                for field in fields:
                    if field == 'Max Load':
                        # 'Max Load' only has to be within .1 of the answer.
                        max_answer = round(float(ans[station][field]), 1)
                        max_line = round(float(row[field]), 1)
                        assert max_answer == max_line
                    else:
                        # Every other field must match exactly.
                        assert ans[station][field] == row[field]
            number_of_rows += 1
            stations.append(station)

    # Output should be 8 lines not including header
    assert number_of_rows == 8
    # Check Station Names
    assert set(stations) == set(correct_stations)


if __name__ == "__main__":
    test()
B) SOLUTION:
SOLUTION 1:
#!/usr/bin/env python
datafile = "2013_ERCOT_Hourly_Load_Data.xls"
outfile = "2013_Max_Loads.csv"
def parse_file(datafile):
    """Find each station's maximum load and the time it occurred.

    Returns a list of rows, header first:
    ["Station", "Year", "Month", "Day", "Hour", "Max Load"].
    """
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)
    # [1:ncols-1] skips column 0 (timestamps) and the final column --
    # presumably a system-wide total; this yields the expected 8 stations.
    stations = sheet.row_values(0)[1:sheet.ncols - 1]
    header = ["Station", "Year", "Month", "Day", "Hour", "Max Load"]
    data = [header]
    for i, station in enumerate(stations):
        station_column = sheet.col_values(i + 1, start_rowx=1, end_rowx=None)
        max_load = max(station_column)          # compute the max only once
        maxpos = station_column.index(max_load)
        # +1 compensates for the header row skipped by col_values above.
        year, month, day, hour, _, _ = xlrd.xldate_as_tuple(
            sheet.cell_value(maxpos + 1, 0), 0)
        data.append([station, year, month, day, hour, max_load])
    return data
def save_file(data, filename):
    """Write the rows in *data* to *filename* as a pipe-delimited CSV."""
    # Python 3's csv.writer emits str, so the file must be opened in text
    # mode with newline="" (per the csv docs); the original "wb" raises
    # TypeError under Python 3.
    with open(filename, "w", newline="") as f:
        wr = csv.writer(f, delimiter='|')
        wr.writerows(data)
=================================================================================
SOLUTION 2:
def parse_file(datafile):
    """Map each station name to its max load and the time it occurred.

    Returns {station: {"maxval": value, "maxtime": (Y, M, D, H, min, s)}}.
    """
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)
    data = {}
    # Columns 1-8 hold the station series (column 0 is the timestamp).
    for n in range(1, 9):
        station = sheet.cell_value(0, n)
        cv = sheet.col_values(n, start_rowx=1, end_rowx=None)
        maxval = max(cv)
        maxpos = cv.index(maxval) + 1  # +1: header row skipped by col_values
        maxtime = sheet.cell_value(maxpos, 0)
        realtime = xlrd.xldate_as_tuple(maxtime, 0)
        data[station] = {"maxval": maxval,
                         "maxtime": realtime}
    # print() call replaces the original Python 2 print statement.
    print(data)
    return data
def save_file(data, filename):
    """Write one pipe-delimited row per station: name, date parts, max load."""
    # newline="" is required by the csv docs to avoid extra blank lines
    # on Windows; the original omitted it.
    with open(filename, "w", newline="") as f:
        w = csv.writer(f, delimiter='|')
        w.writerow(["Station", "Year", "Month", "Day", "Hour", "Max Load"])
        for s in data:
            year, month, day, hour, _, _ = data[s]["maxtime"]
            w.writerow([s, year, month, day, hour, data[s]["maxval"]])
C) WHAT PROBLEMS DO I HAVE DURING PROBLEM EXCEL TO CSV:
Should `data` be a list or a dict? Looking at the CSV output, I think I should build a list instead of a dict, so I do not need to convert it to a list later.