DATA EXTRACTION

I. HANDLE CSV FILES:

  • Instruction: Parse the first 10 data lines of the file, then put the fields of each line into a dict with the format below:

Dict Format:

{'Title': 'Please Please Me', 'UK Chart Position': '1', 'Label': 'Parlophone(UK)',
 'Released': '22 March 1963', 'US Chart Position': '-', 'RIAA Certification': 'Platinum', 
'BPI Certification': 'Gold'}

Use the first line of the file as the header, i.e. as the keys of each dict.

  • Solution 1:
def parse_file(datafile):
    data = []
    with open(datafile, "rb") as f:
        header = f.readline().split(",")
        counter = 0
        for line in f:
            if counter == 10:
                break
            fields = line.split(",")
            entry = {}
            for i, value in enumerate(fields):
                entry[header[i].strip()] = value.strip()
            data.append(entry)
            counter += 1
    return data
  • Solution 2:
def parse_file(datafile):
    data = []
    with open(datafile, "r") as f:
        header = f.readline().split(",")
        for line in f:
            line_dict = {}
            line_walk = 0
            for key in header:
                line_dict[key.strip()] = line.split(',')[line_walk].strip()
                line_walk +=1
            data.append(line_dict)
    return data[0:10]
  • Solution 3: USING CSV MODULE
import csv
def parse_file(datafile):
    data = []
    with open(datafile, "r") as sd:
        r = csv.DictReader(sd)
        for line in r:
            data.append(line)
    return data[0:10]

THIS IS USED TO TEST THE FIRST 10 LINES:

import os
DATADIR = ""
DATAFILE = "beatles-diskography.csv"

def test():
    # a simple test of your implementation
    datafile = os.path.join(DATADIR, DATAFILE)
    d = parse_file(datafile)
    firstline = {'Title': 'Please Please Me', 'UK Chart Position': '1', 'Label': 'Parlophone(UK)', 'Released': '22 March 1963', 'US Chart Position': '-', 'RIAA Certification': 'Platinum', 'BPI Certification': 'Gold'}
    tenthline = {'Title': '', 'UK Chart Position': '1', 'Label': 'Parlophone(UK)', 'Released': '10 July 1964', 'US Chart Position': '-', 'RIAA Certification': '', 'BPI Certification': 'Gold'}
    assert d[0] == firstline
    assert d[9] == tenthline    
test()

II. HANDLE XLS FILES:

  • USING THE XLRD MODULE: pip install xlrd
import xlrd
datafile = "2013_ERCOT_Hourly_Load_Data.xls"
def parse_file(datafile):
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)

    data = [[sheet.cell_value(r, col) 
                for col in range(sheet.ncols)] 
                    for r in range(sheet.nrows)]

    print "\nList Comprehension"
    print "data[3][2]:",
    print data[3][2]

    print "\nCells in a nested loop:"    
    for row in range(sheet.nrows):
        for col in range(sheet.ncols):
            if row == 50:
                print sheet.cell_value(row, col),


    ### other useful methods:
    print "\nROWS, COLUMNS, and CELLS:"
    print "Number of rows in the sheet:", 
    print sheet.nrows
    print "Type of data in cell (row 3, col 2):", 
    print sheet.cell_type(3, 2)
    print "Value in cell (row 3, col 2):", 
    print sheet.cell_value(3, 2)
    print "Get a slice of values in column 3, from rows 1-3:"
    print sheet.col_values(3, start_rowx=1, end_rowx=4)

    print "\nDATES:"
    print "Type of data in cell (row 1, col 0):", 
    print sheet.cell_type(1, 0)
    exceltime = sheet.cell_value(1, 0)
    print "Time in Excel format:",
    print exceltime
    print "Convert time to a Python datetime tuple, from the Excel float:",
    print xlrd.xldate_as_tuple(exceltime, 0)
    return data
data = parse_file(datafile)
  • QUIZ:
"""
Your task is as follows:
- read the provided Excel file
- find and return the min, max and average values for the COAST region
- find and return the time value for the min and max entries
- the time values should be returned as Python tuples

Please see the test function for the expected return format
"""
import re
import numpy as np
import xlrd
from zipfile import ZipFile
datafile = "2013_ERCOT_Hourly_Load_Data.xls"
def open_zip(datafile):
    with ZipFile(re.sub('.xls','','{0}.zip'.format(datafile)), 'r') as myzip:
        myzip.extractall()
def parse_file(datafile):
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)
    coast = sheet.col_values(1,start_rowx=1, end_rowx=None)
    maxpos = coast.index(max(coast))
    minpos = coast.index(min(coast))
    data = {
            'maxtime': xlrd.xldate_as_tuple(sheet.cell_value(maxpos+1,0),0),
            'maxvalue': max(coast),
            'mintime': xlrd.xldate_as_tuple(sheet.cell_value(minpos+1,0),0),
            'minvalue': min(coast),
            'avgcoast': np.mean(coast)
    }
    return data
def test():
    open_zip(datafile)
    data = parse_file(datafile)

    assert data['maxtime'] == (2013, 8, 13, 17, 0, 0)
    assert round(data['maxvalue'], 10) == round(18779.02551, 10)
test()

III. HANDLE JSON FILES:

  • IMPORT JSON:

The two most important functions in Python's json module are .loads() and .dumps().

.loads(): converts a JSON string (str type, but the content looks like JSON) into a dict. Purpose: a plain string is awkward to loop over or query, so converting it to a dict makes the data easy to manage and handle.

-Ex:

json_string = '{"first_name": "Guido", "last_name":"Rossum"}'
import json
parsed_json = json.loads(json_string)
print parsed_json
print "=========================="
print type(parsed_json)
Output:

{u'first_name': u'Guido', u'last_name': u'Rossum'}
==========================
<type 'dict'>

.dumps(): converts a dict into a JSON string (str type, but the content looks like JSON). Purpose: converting a dict to a JSON string is not very useful on its own; what makes .dumps() handy are keyword arguments such as indent and sort_keys, which let us print the JSON string in a customized, pretty way.

-Ex:

d = {
    'first_name': 'Guido',
    'second_name': 'Rossum',
    'titles': ['BDFL', 'Developer'],
}
print "==================================== BEFORE ========================================="
print d,"\n"
print "===================================== AFTER ========================================="
def pretty_print(data, indent=4):
    # After we get our output, we can format it to be more readable
    # by using this function.
    if type(data) == dict:
        print json.dumps(data, indent=indent, sort_keys=True)
    else:
        print data
pretty_print(d)
Output:

==================================== BEFORE =========================================
{'first_name': 'Guido', 'titles': ['BDFL', 'Developer'], 'second_name': 'Rossum'} 

===================================== AFTER =========================================
{
    "first_name": "Guido", 
    "second_name": "Rossum", 
    "titles": [
        "BDFL", 
        "Developer"
    ]
}
  • IMPORT REQUESTS:

The important parts of the requests module to focus on are: requests.get(url, params=params), checking r.status_code == requests.codes.ok, r.json(), and r.raise_for_status(). The following examples show how they all fit together:


-Ex:

import json
import requests

BASE_URL = "http://musicbrainz.org/ws/2/"
ARTIST_URL = BASE_URL + "artist/"

# query parameters are given to the requests.get function as a dictionary; this
# variable contains some starter parameters.
query_type = {  "simple": {},
                "atr": {"inc": "aliases+tags+ratings"},
                "aliases": {"inc": "aliases"},
                "releases": {"inc": "releases"}}

def query_site(url, params, uid="", fmt="json"):
    # This is the main function for making queries to the musicbrainz API.
    # A json document should be returned by the query.
    params["fmt"] = fmt
    r = requests.get(url + uid, params=params)
    print "params:", params
    print "requesting", r.url

    if r.status_code == requests.codes.ok:
        return r.json()
    else:
        r.raise_for_status()

def query_by_name(url, params, name):
    # This adds an artist name to the query parameters before making
    # an API call to the function above.
    params["query"] = "artist:" + name
    return query_site(url, params)

results = query_by_name(ARTIST_URL, query_type["simple"], "Nirvana")
print results
Output:

requesting http://musicbrainz.org/ws/2/artist/?query=artist%3ANirvana&fmt=json
{u'count': 14, u'offset': 0, u'artists': [{u'name': u'Nirvana', u'area': {u'sort-name': u'United States', u'id': u'489ce91b-6658-3307-9877-795b68554c98', u'name': u'United States'}, u'country': u'US', u'disambiguation': u'90s US grunge band', u'tags': [{u'count': 1, u'name': u'punk'}, {u'count': 0, u'name': u'legendary'}, {u'count': 0, u'name': u'90'}, {u'count': 1, u'name': u'seattle'}, {u'count': 0, u'name': u'northwest'}, {u'count': 0, u'name': u'alternative'}, {u'count': 0, u'name': u'rock and indie'}, {u'count': 1, u'name': u'usa'}, {u'count': 0, u'name': u'am\xe9ricain'}, {u'count': 0, u'name': u'united states'}, {u'count': 0, u'name': u'kurt cobain'}
........

So the code calls requests.get on the base URL http://musicbrainz.org/ws/2/artist/ with the query string ?query=artist%3ANirvana&fmt=json appended. In this case, params is constructed as the following dict:

params= {'query': 'artist:Nirvana', 'fmt': 'json'}
  • BASICALLY, WHEN FIRST LEARNING JSON AND REQUESTS, WE SHOULD NOT JUMP STRAIGHT INTO WRITING FUNCTIONS SUCH AS query_site OR query_by_name. THE FUNCTIONS MAKE THE CODE MORE COMPLETE, BUT THEY ADD COMPLEXITY.
  • THEREFORE, HERE IS ANOTHER WAY OF GETTING THE SAME RESULTS:
import json
import requests
url = "http://musicbrainz.org/ws/2/artist/"
params = {}
params['query'] = 'artist:Nirvana'
params['fmt'] = 'json'
results = requests.get(url, params=params)
print results.url
print results.json()
Output:

http://musicbrainz.org/ws/2/artist/?query=artist%3ANirvana&fmt=json
{u'count': 14, u'offset': 0, u'artists': [{u'name': u'Nirvana', u'area': {u'sort-name': u'United States', u'id': u'489ce91b-6658-3307-9877-795b68554c98', u'name': u'United States'}, u'country': u'US', u'disambiguation': u'90s US grunge band', u'tags': [{u'count': 1, u'name': u'punk'}, {u'count': 0, u'name': u'legendary'}, {u'count': 0, u'name': u'90'}, {u'count': 1, u'name': u'seattle'}, {u'count': 0, u'name': u'northwest'}, {u'count': 0, u'name': u'alternative'}, {u'count': 0, u'name': u'rock and indie'}, {u'count': 1, u'name': u'usa'},...

See the MusicBrainz API documentation to learn more about the web service.

  • USEFUL FUNCTION FOR DATA EXPLORATION
def pretty_print(data, indent=4):
    # After we get our output, we can format it to be more readable
    # by using this function.
    if type(data) == dict:
        print json.dumps(data, indent=indent, sort_keys=True)
    else:
        print data
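For example, with the Nirvana query results from earlier:

pretty_print(results)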
Copy to Clipboard:

import clipboard
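# clipboard is a third-party package (assumed installed, e.g. pip install clipboard)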
def pretty_copy(data, indent=4):
    # After we get our output, we can format it to be more readable
    # by using this function.
    if type(data) == dict:
        temp = json.dumps(data, indent=indent, sort_keys=True)
    else:
        temp = data
    clipboard.copy(temp)
  • EXPLORING JSON QUIZ:

SOLUTION :

import json
import requests

url = "http://musicbrainz.org/ws/2/artist/"

# how many bands named "First Aid Kit"

params = {}
params["query"] = "artist:First Aid Kit"
params["fmt"] = "json"

data = requests.get(url, params=params).json()

count = 0
for a in data["artists"]:
    if a["sort-name"] == "First Aid Kit":
        count = count + 1

print(count) # 

#############################################################

# begin-area name for queen

params["query"] = "artist:Queen"
data = requests.get(url, params=params).json()

for a in data["artists"]:
    if a["sort-name"] == "Queen":
        if "begin-area" in a:
            print(a["begin-area"]["name"])

# London

#############################################################

# Spanish alias for Beatles?

params["query"] = "artist:Beatles"
data = requests.get(url, params=params).json()

for artist in data["artists"]:
    if artist["name"] == "The Beatles":
        for alias in artist["aliases"]:
            print(alias["locale"], alias["name"])

# one of the output lines will be:
# es Los Beatles

#############################################################

# Nirvana disambiguation

params["query"] = "artist:Nirvana"
data = requests.get(url, params=params).json()

for artist in data["artists"]:
    if artist["name"] == "Nirvana":
        print(artist["country"], "\t", artist["disambiguation"])

# one of the output lines will be:
# US      90s US grunge band

#############################################################

# when was "One Direction" formed

params["query"] = "artist:One Direction"
data = requests.get(url, params=params).json()

for artist in data["artists"]:
    if artist["name"] == "One Direction":
        print(artist["life-span"]["begin"])

# 2010-07



# leftover exploration snippet: write the query result (`data` from above) to a file
f = open("test.json", mode="w")
f.write(json.dumps(data, sort_keys=True, indent=4))
f.close()

# handy attributes/methods on a requests Response object (`response` is assumed to
# be the object returned by a requests.get call like the ones above)
response.url       # the full URL
response.text      # text result
response.json()    # result in json format

print(json.dumps(response.json(), sort_keys=True, indent=4))


# look up a single artist by id and list its releases; the index [1] simply picks
# the second artist from the earlier search result
id = response.json()["artists"][1]["id"]
url = "http://musicbrainz.org/ws/2/artist/" + str(id)

params = {}
params["inc"] = "releases"
params["fmt"] = "json"

response = requests.get(url, params=params)
if response.status_code == requests.codes.ok:
    print("request successful")


data = response.json()
data.keys()     # there's a key called "releases"
releases = data["releases"] # this is a list, each is a dictionary with "title"

for r in releases:
    print(r["title"])

DATA EXTRACTION PROBLEMS:

PROBLEM 1: USING CSV MODULE

A) INSTRUCTIONS:

Your task is to process the supplied file and use the csv module to extract data from it.

Note that the first line of the datafile is neither a data entry nor a header; it is a line describing the data source. You should extract the name of the station from it.

The data should be returned as a list of lists (not dictionaries). You can use the csv module's reader() to get the data in that format. Another useful method is next(), which returns the next line from the iterator. You should only change the parse_file function.

QUICK LOOK AT CSV FILE:

GIVEN CODE:

import csv
import os

DATADIR = ""
DATAFILE = "745090.csv"


def parse_file(datafile):
    name = ""
    data = []
    with open(datafile,'rb') as f:
        pass
    # Do not change the line below
    return (name, data)

def test():
    datafile = os.path.join(DATADIR, DATAFILE)
    name, data = parse_file(datafile)

    assert name == "MOUNTAIN VIEW MOFFETT FLD NAS"
    assert data[0][1] == "01:00"
    assert data[2][0] == "01/01/2005"
    assert data[2][5] == "2"


if __name__ == "__main__":
    test()

B) SOLUTION:

#!/usr/bin/env python
import csv
import os

DATADIR = ""
DATAFILE = "745090.csv"


def parse_file(datafile):
    name = ""
    data = []
    with open(datafile,'rb') as f:
        name = f.next().split(',')[1].strip('"')
        f.next()
        r = csv.reader(f)
        for line in r:
            data.append(line)
    # Do not change the line below
    return (name, data)


def test():
    datafile = os.path.join(DATADIR, DATAFILE)
    name, data = parse_file(datafile)

    assert name == "MOUNTAIN VIEW MOFFETT FLD NAS"
    assert data[0][1] == "01:00"
    assert data[2][0] == "01/01/2005"
    assert data[2][5] == "2"


if __name__ == "__main__":
    test()

C) PROBLEMS I RAN INTO WHILE EXTRACTING THE CSV FILE:

  • I did not know that .next() returns the value of the line it skips, so it can be used both to advance past a line and to capture that line's contents.
  • .split(',') returns a list of strings; an element may still be wrapped in double quotes, so .strip('"') is needed to remove them. A minimal sketch of both points follows below.
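A minimal sketch of both points, assuming the same 745090.csv file as in the problem above (next(f) is the portable spelling of f.next()):

with open("745090.csv") as f:
    first = next(f)              # next() advances past the first line AND returns it
    name = first.split(',')[1]   # split(',') gives a list of strings
    name = name.strip('"')       # an element may still carry surrounding double quotes
    print(name)                  # MOUNTAIN VIEW MOFFETT FLD NAS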

PROBLEM 2: EXCEL TO CSV:

A) INSTRUCTIONS:

Find the time and value of max load for each of the regions COAST, EAST, FAR_WEST, NORTH, NORTH_C, SOUTHERN, SOUTH_C, WEST and write the result out in a csv file, using pipe character | as the delimiter.

An example output can be seen in the "example.csv" file.


QUICK LOOK AT EXCEL FILE:


QUICK LOOK AT CSV OUTPUT:

GIVEN CODE:

# -*- coding: utf-8 -*-
import xlrd
import os
import csv
from zipfile import ZipFile

datafile = "2013_ERCOT_Hourly_Load_Data.xls"
outfile = "2013_Max_Loads.csv"


def open_zip(datafile):
    with ZipFile('{0}.zip'.format(datafile), 'r') as myzip:
        myzip.extractall()


def parse_file(datafile):
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)
    data = None
    # YOUR CODE HERE
    return data

def save_file(data, filename):
    # YOUR CODE HERE
    pass


def test():
    open_zip(datafile)
    data = parse_file(datafile)
    save_file(data, outfile)

    number_of_rows = 0
    stations = []

    ans = {'FAR_WEST': {'Max Load': '2281.2722140000024',
                        'Year': '2013',
                        'Month': '6',
                        'Day': '26',
                        'Hour': '17'}}
    correct_stations = ['COAST', 'EAST', 'FAR_WEST', 'NORTH',
                        'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST']
    fields = ['Year', 'Month', 'Day', 'Hour', 'Max Load']

    with open(outfile) as of:
        csvfile = csv.DictReader(of, delimiter="|")
        for line in csvfile:
            station = line['Station']
            if station == 'FAR_WEST':
                for field in fields:
                    # Check if 'Max Load' is within .1 of answer
                    if field == 'Max Load':
                        max_answer = round(float(ans[station][field]), 1)
                        max_line = round(float(line[field]), 1)
                        assert max_answer == max_line

                    # Otherwise check for equality
                    else:
                        assert ans[station][field] == line[field]

            number_of_rows += 1
            stations.append(station)

        # Output should be 8 lines not including header
        assert number_of_rows == 8

        # Check Station Names
        assert set(stations) == set(correct_stations)


if __name__ == "__main__":
    test()

B) SOLUTION:

SOLUTION 1:

#!/usr/bin/env python
import xlrd
import csv

datafile = "2013_ERCOT_Hourly_Load_Data.xls"
outfile = "2013_Max_Loads.csv"

def parse_file(datafile):
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)
    stations = sheet.row_values(0)[1:sheet.ncols-1]
    header = ["Station","Year","Month","Day","Hour","Max Load"]
    data = []
    data.append(header)
    for i in range(len(stations)):
        station = stations[i]
        station_column = sheet.col_values(i+1, start_rowx=1, end_rowx=None)
        max_load = max(station_column)
        maxpos = station_column.index(max(station_column))
        year, month, day, hour, dump1, dump2 = xlrd.xldate_as_tuple(sheet.cell_value(maxpos+1,0),0)
        data.append([station, year,month, day, hour, max_load])
    return data

def save_file(data, filename):
    with open(filename, "wb") as f:
        wr = csv.writer(f,delimiter='|')
        wr.writerows(data)
=================================================================================
SOLUTION 2:

def parse_file(datafile):
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)
    data = {}
    # process all rows that contain station data
    for n in range (1, 9):
        station = sheet.cell_value(0, n)
        cv = sheet.col_values(n, start_rowx=1, end_rowx=None)

        maxval = max(cv)
        maxpos = cv.index(maxval) + 1
        maxtime = sheet.cell_value(maxpos, 0)
        realtime = xlrd.xldate_as_tuple(maxtime, 0)
        data[station] = {"maxval": maxval,
                         "maxtime": realtime}

    print data
    return data

def save_file(data, filename):
    with open(filename, "w") as f:
        w = csv.writer(f, delimiter='|')
        w.writerow(["Station", "Year", "Month", "Day", "Hour", "Max Load"])
        for s in data:
            year, month, day, hour, _ , _= data[s]["maxtime"]
            w.writerow([s, year, month, day, hour, data[s]["maxval"]])

C) PROBLEMS I RAN INTO DURING THE EXCEL-TO-CSV PROBLEM:

  • I was not sure which data type parse_file should return: a list or a dict. Looking at the csv output, returning a list of lists is simpler, because the rows can be written out directly without converting a dict later (see the sketch below).
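A small sketch of that trade-off, using the FAR_WEST answer from the test above (the output file names here are illustrative): a list of lists can be handed to csv.writer.writerows() as-is, while a dict first has to be flattened into rows, which is what Solution 2's save_file does.

import csv

# list of lists: write the rows as-is
rows = [["Station", "Year", "Month", "Day", "Hour", "Max Load"],
        ["FAR_WEST", 2013, 6, 26, 17, 2281.2722140000024]]
with open("max_loads_from_list.csv", "w") as f:
    csv.writer(f, delimiter='|').writerows(rows)

# dict keyed by station: flatten to rows before writing
by_station = {"FAR_WEST": {"maxval": 2281.2722140000024,
                           "maxtime": (2013, 6, 26, 17, 0, 0)}}
with open("max_loads_from_dict.csv", "w") as f:
    w = csv.writer(f, delimiter='|')
    w.writerow(["Station", "Year", "Month", "Day", "Hour", "Max Load"])
    for station, v in by_station.items():
        year, month, day, hour, _, _ = v["maxtime"]
        w.writerow([station, year, month, day, hour, v["maxval"]])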