Section I: Introduction, Configuration and Setup
Section II: Aggregating EIA Data with Python
Section III: Cleaning, Visualization with Pandas, Matplotlib
Section IV: Understanding SVG Files with BeautifulSoup
Section V: Geoplotting State-Level US Data to Create Heatmaps
The EIA website hosts quite a large range of data on energy and the energy industry. From the start, I knew I wanted to find state-level data on renewable energy production and data on residential electricity. However, out of curiosity I created a short code snippet that uses the API's keyword search call. The call simply returns metadata on all data series that are relevant to a provided keyword. Calling requests.get() and reading the response's .text attribute returns the JSON data as a string. This string can easily be parsed into Python dictionaries and lists with json.loads. Once parsed, the data can be accessed by indexing your way down the response's element tree, the same way you would access elements of a list or a dictionary.
Using search_keywords() returns the names of all relevant series, along with their corresponding series IDs. These IDs are used to compile specific data.
# Saved search results, keyed by the label passed to search_series().
searches = {}


def search_keywords():
    """Prompt for a keyword and print the EIA series whose names match it.

    Each match is printed as a one-entry dict mapping the series name to its
    series ID. Entering 'x' or 'q' at the prompt cancels without searching.
    Nothing is returned or stored.
    """
    query = raw_input('Search for Keyword: ')
    if query in ('x', 'q'):
        return

    # Hand the keyword to requests as a parameter instead of formatting it
    # into the URL, so characters such as '&' or '#' in the keyword are
    # escaped rather than corrupting the query string.
    r = requests.get(
        'http://api.eia.gov/search/',
        params={
            'search_term': 'name',
            'search_value': '"{}"'.format(query),
            'rows_per_page': '10000',
        }
    )
    docs = json.loads(r.text)['response']['docs']

    print('Search Results Found:')
    for series in docs:
        print({series['name']: series['series_id']})
Search for Keyword: renewable energy Search Results Found: {u'Renewable energy production, Iowa': u'SEDS.REPRB.IA.A'} {u'Renewable energy production, Ohio': u'SEDS.REPRB.OH.A'} {u'Renewable energy production, Utah': u'SEDS.REPRB.UT.A'} {u'Renewable energy production, Idaho': u'SEDS.REPRB.ID.A'} {u'Renewable energy production, Maine': u'SEDS.REPRB.ME.A'}
# Per-label lookup tables (columns: 'State', <label>) built by search_series().
df_i = {}


def search_series(query, save_as):
    """Search EIA by series ID, print the matches and optionally save them.

    Parameters:
        query: series-ID text to search for (e.g. 'SEDS.REPRB').
        save_as: label under which the results are stored.

    Every match is printed as a one-entry dict of series name to series ID.
    If the user answers the prompt with something starting with 'y', the raw
    results are stored in ``searches[save_as]`` and a two-column DataFrame
    ('State', ``save_as``) mapping each state to its series ID is stored in
    ``df_i[save_as]``. Returns None.
    """
    # Let requests escape the query instead of formatting it into the URL.
    r = requests.get(
        'http://api.eia.gov/search/',
        params={
            'search_term': 'series_id',
            'search_value': '"{}"'.format(query),
            'rows_per_page': '10000',
        }
    )
    docs = json.loads(r.text)['response']['docs']

    print('Search Results Found:')
    for series in docs:
        print({series['name']: series['series_id']})

    save_search = raw_input('Would you like to save these search results? ')
    # Require the answer to *start* with 'y'; a substring test would also
    # accept answers such as "nay" or "no way".
    if not save_search.strip().lower().startswith('y'):
        return

    searches[save_as] = docs
    # Series names are assumed to look like "<description>, <state>"; the
    # text after the first ', ' is taken as the state.
    rows = [(doc['name'].split(', ')[1], doc['series_id']) for doc in docs]
    # Build the lookup table directly. This also yields a well-formed empty
    # table (with both columns) when the search returned nothing.
    df_i[save_as] = pd.DataFrame(rows, columns=['State', save_as])
# Each entry pairs an EIA series-ID prefix with the label its search results
# are saved under.
factors = [
    ['SEDS.RETCB', 'Renewable Consumption'],
    ['SEDS.REPRB', 'Renewable Production'],
    ['SEDS.TEPRB', 'Total Energy Production'],
    ['SEDS.ESRCD', 'Price of Residential Electricity'],
]

# Run one series search per factor; search_series() asks interactively
# whether each result set should be saved.
for factor in factors:
    query, save_as = factor
    search_series(query, save_as)
# helper functions
def query_url(url):
    """Fetch ``url`` with an HTTP GET and return the parsed JSON body.

    Raises requests.HTTPError when the server answers with a 4xx/5xx status,
    so an HTML error page is reported as an HTTP failure instead of an
    opaque JSON decoding error.
    """
    r = requests.get(url)
    r.raise_for_status()
    return json.loads(r.text)


def nav_json(j):
    """Return the 'data' entry of the first item under ``j['series']``.

    Raises KeyError if 'series' or 'data' is missing and IndexError if the
    'series' list is empty.
    """
    return j['series'][0]['data']
def compile_state_data(series):
    """Download one saved series for every state and stack the results.

    Parameters:
        series: label of a table previously stored in ``df_i`` by
            search_series(); that table supplies the 'State' names and,
            in the column named ``series``, the EIA series IDs to fetch.

    Returns:
        A DataFrame with columns 'State', 'Year' and ``series`` holding the
        rows of every state one after another (an empty DataFrame when the
        table lists no states). On failure a message is printed and None is
        returned.

    Uses the module-level ``df_i``, ``api_key``, ``query_url`` and
    ``nav_json``.
    """
    try:
        index = df_i[series]
    except KeyError as e:
        # Only an unknown label is reported with this message; previously
        # every failure (network, parsing, ...) was reported this way.
        print('Error:  {} is not a recognized EIA Series ID.'.format(e))
        return None

    frames = []  # one frame per state, concatenated once at the end
    try:
        # Walk the table row by row so each series ID is paired with the
        # state on its own row, instead of re-filtering the table per ID.
        for state, series_id in zip(index['State'], index[series]):
            url = 'http://api.eia.gov/series/?series_id={}&api_key={}&out=json'.format(series_id, api_key)
            data = nav_json(query_url(url))
            # Naming the columns up front keeps an empty data list from
            # failing on a column-count mismatch.
            df_inst = pd.DataFrame(data, columns=['Year', series])
            # Parsing with '%Y' both validates the value and yields an int.
            df_inst['Year'] = [datetime.datetime.strptime(str(x), '%Y').year
                               for x in df_inst['Year']]
            df_inst['State'] = state
            frames.append(df_inst[['State', 'Year', series]])
    except Exception as e:
        # Best effort, as before: report the problem and return None.
        print("Error: could not compile data for '{}': {}".format(series, e))
        return None

    if not frames:
        return pd.DataFrame()
    # A single concat replaces the repeated DataFrame.append calls, which
    # copied the accumulated frame on every iteration.
    return pd.concat(frames, ignore_index=True)
# Download the state-by-year table for each saved factor.
Total_Energy = compile_state_data('Total Energy Production')
Renewable_Energy = compile_state_data('Renewable Production')
Res_Electricity_Price = compile_state_data('Price of Residential Electricity')

# Join the three tables on state and year (default inner join, so only
# state/year pairs present in all three are kept).
keys = ['State', 'Year']
df = (
    Total_Energy
    .merge(Renewable_Energy, on=keys)
    .merge(Res_Electricity_Price, on=keys)
)

# Renewable share of total production, stored as a plain ratio (it is not
# multiplied by 100 despite the column name).
df['Renewable, Percentage of Production'] = (
    df['Renewable Production'] / df['Total Energy Production']
)
# Persist the merged DataFrame as a pickle. The with-block closes the file
# even if pickle.dump raises. Note that 'df.txt' holds binary pickle data,
# not text.
with open('df.txt', 'wb') as outFile:
    pickle.dump(df, outFile)