c-trl | Data Wrangling with Python

Collecting Data on Energy

The EIA website hosts quite a large range of data on energy and the energy industry. From the start, I knew I wanted to find state-level data on renewable energy production and data on residential electricity. However, out of curiosity I created a short code snippet that uses the API's keyword search call. The call simply returns metadata on all data series that are relevant to a provided keyword. Calling requests.get() and reading the response's .text attribute returns the JSON data as a string. This can easily be parsed into Python objects (nested dictionaries and lists) with json.loads. Once parsed, the data can be accessed by indexing your way down the response's element tree, the same way you would access elements of a list or a dictionary.

Calling search_keywords() prints the names of all relevant series, along with their corresponding series IDs. These IDs are then used to compile specific data.

# Raw search results (lists of series documents) saved by search_series(),
# keyed by the label passed to it as save_as.
searches = {}

def search_keywords():
    """Prompt for a keyword and print every EIA series whose name matches it.

    Entering 'x' or 'q' at the prompt skips the search. Otherwise each match
    is printed as a one-entry dict of {series name: series ID}.
    """
    keyword = raw_input('Search for Keyword: ')
    if keyword in ('x', 'q'):
        return
    url_template = 'http://api.eia.gov/search/?search_term=name&search_value="{}"&rows_per_page={}'
    reply = requests.get(url_template.format(keyword, '10000'))
    # The matching series documents live under response -> docs.
    matches = json.loads(reply.text)['response']['docs']
    print('Search Results Found:')
    for match in matches:
        print({match['name']: match['series_id']})

Search for Keyword: renewable energy
Search Results Found:
{u'Renewable energy production, Iowa': u'SEDS.REPRB.IA.A'}
{u'Renewable energy production, Ohio': u'SEDS.REPRB.OH.A'}
{u'Renewable energy production, Utah': u'SEDS.REPRB.UT.A'}
{u'Renewable energy production, Idaho': u'SEDS.REPRB.ID.A'}
{u'Renewable energy production, Maine': u'SEDS.REPRB.ME.A'}

# Lookup tables built by search_series(), keyed by label: each value is a
# DataFrame with a 'State' column and a series-ID column named after the label.
df_i = {}

def search_series(query, save_as):
    """Search EIA series by series ID and optionally save the matches.

    Prints each matching series as {name: series ID}, then asks whether to
    keep the results. On any answer containing 'y' (case-insensitive) the raw
    search documents are stored in ``searches[save_as]`` and a two-column
    DataFrame ('State', ``save_as``) pairing each state with its series ID is
    stored in ``df_i[save_as]``.

    Args:
        query: Value matched against series_id by the search API
            (e.g. 'SEDS.REPRB').
        save_as: Label used as the key in ``searches`` and ``df_i`` and as
            the name of the series-ID column.
    """
    base_url = 'http://api.eia.gov/search/?search_term=series_id&search_value="{}"&rows_per_page={}'
    r = requests.get(base_url.format(query, '10000'))
    docs = json.loads(r.text)['response']['docs']
    print('Search Results Found:')
    for series in docs:
        print({series['name']: series['series_id']})
    save_search = raw_input('Would you like to save these search results? ')
    if 'y' not in save_search.lower():
        return
    searches[save_as] = docs
    # Series names are expected to look like "<description>, <State>"; the
    # second ', '-separated segment is taken as the state name.
    states = [doc['name'].split(', ')[1] for doc in docs]
    series_ids = [doc['series_id'] for doc in docs]
    # Build the State/ID table directly. The previous route (one-key dicts ->
    # wide frame -> stack -> reset_index -> drop -> rename) relied on stack()
    # discarding NaN cells; direct construction yields the same two columns
    # in the same row order without that dependency, and gives an empty
    # table when there are no documents.
    df_i[save_as] = pd.DataFrame({'State': states, save_as: series_ids},
                                 columns=['State', save_as])

# [series-ID search value, label to save the results under] pairs.
factors = [
    ['SEDS.RETCB', 'Renewable Consumption'],
    ['SEDS.REPRB', 'Renewable Production'],
    ['SEDS.TEPRB', 'Total Energy Production'],
    ['SEDS.ESRCD', 'Price of Residential Electricity'],
]

# Run one interactive search per pair.
for factor in factors:
    query, save_as = factor
    search_series(query, save_as)

#helper functions

def query_url(url):
    """Fetch *url* with an HTTP GET and return the response body parsed as JSON."""
    response = requests.get(url)
    return json.loads(response.text)

def nav_json(j):
    """Return the 'data' entry of the first item under 'series' in *j*."""
    return j['series'][0]['data']

def compile_state_data(series):
    """Download one saved series for every state and stack the results.

    For each (State, series ID) row of ``df_i[series]`` the series is fetched
    through query_url()/nav_json() and turned into a frame with the columns
    'State', 'Year' and ``series``; the per-state frames are then
    concatenated into one table.

    Args:
        series: Label of a lookup table previously stored in ``df_i``; also
            the name given to the value column.

    Returns:
        A DataFrame with columns ['State', 'Year', series] (an empty
        DataFrame when the lookup table has no rows), or None when any error
        occurred, in which case a message is printed instead of raising.
    """
    try:
        lookup = df_i[series]
        frames = []
        # Walk the State and series-ID columns in lockstep so each download
        # is labelled with the state on its own row.
        for state, series_id in zip(lookup['State'], lookup[series]):
            url = 'http://api.eia.gov/series/?series_id={}&api_key={}&out=json'.format(series_id, api_key)
            rows = nav_json(query_url(url))
            df_inst = pd.DataFrame(rows)  # contains year, series data
            df_inst.columns = ['Year', series]
            # Parsing with '%Y' both validates the period as a year and
            # converts it to an int.
            df_inst['Year'] = [datetime.datetime.strptime(str(x), '%Y').year for x in df_inst['Year']]
            df_inst['State'] = state
            frames.append(df_inst[['State', 'Year', series]])
        if not frames:
            return pd.DataFrame()
        # One concat at the end instead of DataFrame.append per state:
        # append re-copies the accumulated rows on every call and no longer
        # exists in pandas 2.x.
        return pd.concat(frames, ignore_index=True)
    except Exception as e:
        # Best-effort, as before: every failure is reported with this message
        # and the function falls through to return None. Note that errors
        # other than an unknown label/ID get the same wording.
        print('Error:  ' + str(e) + ' is not a recognized EIA Series ID.')

# Per-state yearly tables for each saved series.
Total_Energy = compile_state_data('Total Energy Production')
Renewable_Energy = compile_state_data('Renewable Production')
Res_Electricity_Price = compile_state_data('Price of Residential Electricity')

# Join the three tables on state and year, one merge at a time.
keys = ['State', 'Year']
df = pd.merge(Total_Energy, Renewable_Energy, on=keys)
df = pd.merge(df, Res_Electricity_Price, on=keys)

# Renewable production divided by total production (a ratio; it is not
# multiplied by 100 despite the column name).
df['Renewable, Percentage of Production'] = (
    df['Renewable Production'] / df['Total Energy Production']
)

# Persist the merged table as a pickle. The context manager closes the file
# even if pickling raises part-way through.
with open('df.txt', 'wb') as outFile:
    pickle.dump(df, outFile)

← Back