# Name: Matthew Herdzik
# Date: December 10th, 2021
# URL: https://xizt-ence.github.io/gentrification/
# Project: A Study of Gentrification and the variability of economic and racial factors in response
#    to shifting housing prices

import pandas as pd

# Abbreviated list used during testing
trial_counties = ['Albany County', 'Allegany County', 'Bronx County', 'Broome County']

# Full list of NYS counties (62)
ny_counties = ['Albany County', 'Allegany County', 'Bronx County', 'Broome County',
 'Cattaraugus County', 'Cayuga County', 'Chautauqua County', 'Chemung County',
 'Chenango County', 'Clinton County', 'Columbia County', 'Cortland County',
 'Delaware County', 'Dutchess County', 'Erie County', 'Essex County',
 'Franklin County', 'Fulton County', 'Genesee County', 'Greene County',
 'Hamilton County', 'Herkimer County', 'Jefferson County', 'Kings County',
 'Lewis County', 'Livingston County', 'Madison County', 'Monroe County',
 'Montgomery County', 'Nassau County', 'New York County', 'Niagara County',
 'Oneida County', 'Onondaga County', 'Ontario County', 'Orange County',
 'Orleans County', 'Oswego County', 'Otsego County', 'Putnam County',
 'Queens County', 'Rensselaer County', 'Richmond County', 'Rockland County',
 'Saratoga County', 'Schenectady County', 'Schoharie County',
 'Schuyler County', 'Seneca County', 'Steuben County', 'St. Lawrence County',
 'Suffolk County', 'Sullivan County', 'Tioga County', 'Tompkins County',
 'Ulster County', 'Warren County', 'Washington County', 'Wayne County',
 'Westchester County', 'Wyoming County', 'Yates County']

# Merging Rent Data over several years into one Data Set
# Data publically available at:
# https://www.rentdata.org/states/new-york/2022

# Rename columns for ease of access
def mr_helper_namer(year, df):
    df.rename(columns = {'0 BR':'0_BR_'+str(year),
                         '1 BR':'1_BR_'+str(year),
                         '2 BR':'2_BR_'+str(year),
                         '3 BR':'3_BR_'+str(year),
                         '4 BR':'4_BR_'+str(year),
                         'Est. Population':'Est.Pop_'+str(year)},
                          inplace = True)
    return df
    
# Iterate through several csvs and combine them    
def merge_rent():
    year = 2006
    file_prefix = 'ny_rentdata_'
    file_suffix = '.csv'
    df_rent = pd.read_csv(file_prefix+str(year)+file_suffix)
    df_rent = mr_helper_namer(year, df_rent)
    year += 1
    while (year < 2020):
        tmp_rent = pd.read_csv(file_prefix+str(year)+file_suffix)
        tmp_rent = mr_helper_namer(year, tmp_rent)
        df_rent = merge(df_rent, tmp_rent, how = 'inner', on = 'County')
        year += 1
    df_rent.to_csv('dtxe.rent.csv')

# Extract columns with an assigned number of bedrooms from all years into a different csv
def partition (br, df):
    y = 2006
    tmp = pd.DataFrame()
    while (y < 2020):    #2021 has been removed due to anomalous data
        tmp[str(y)]=df[str(br)+'_BR_'+str(y)]
        y += 1
    return tmp

def part_main():
    # initialize variables
    df = pd.read_csv('ny_rentdata_merged-v2.csv')
    br = 0
    # iterate through bedrooms [0,5)
    # ouput clean dataset into separate files
    while (br < 5):
        tmp_br = partition(br,df)
        tmp_br.to_csv('dtxe.br'+str(br)+'.csv')
        br += 1
    
    
# Cleaning Crime Dataset
# Data publically available at: 
# https://data.ny.gov/Public-Safety/Index-Crimes-by-County-and-Agency-Beginning-1990/ca8h-8gjq

# Remove unnecessary column names and standardize County naming
def clean_crime():
    df_crime = pd.read_csv('Index_Crimes_by_County_and_Agency__Beginning_1990.csv')
    df_crime = df_crime.loc[:,['County','Year','Index Total','Violent Total', 'Property Total']]
    df_crime['County'] = df_crime['County']+' County'
    df_crime.to_csv('dtxe.crime.csv')
    
# Cleaning Wage Dataset
# Data publically available at:
# https://data.ny.gov/Economic-Development/Quarterly-Census-of-Employment-and-Wages-Annual-Da/shc7-xcbw

# NAICS determines which field of employment is being observed, '0' identifies the total
# Standardize column name for Counties
def clean_wages():
    df_wages = pd.read_csv('Quarterly_Census_of_Employment_and_Wages_Annual_Data__Beginning_2000.csv')
    df_wages = df_wages[df_wages['Year'] > 2005]
    df_wages = df_wages[df_wages['NAICS'] == 0]
    df_wages = df_wages[df_wages['Area Type'] == 'County']
    df_wages = df_wages.loc[:,['Area', 'Year', 'Average Employment', 'Total Wage', 'Annual Average Salary']]
    df_wages.rename(columns = {'Area':'County'}, inplace = True)
    df_wages.to_csv('dtxe.wages.csv')

# Remove unnecessary data columns
# Parse through each county and append it to the main dataframe
def organize_wages():
    df_ow = pd.read_csv('dtxe.wages.csv')
    df_salary = pd.DataFrame()
    for c in ny_counties:
        tmp = df_ow[df_ow['County'] == c]
        tmp = tmp.loc[:,['Year','Annual Average Salary']]
        tmp.rename(columns = {'Annual Average Salary':c}, inplace = True)
        tmp['Year'] = tmp['Year'].astype(str)
        tmp = tmp.transpose()
        tmp.rename(columns=tmp.iloc[0], inplace = True)
        tmp.drop(tmp.index[0], inplace = True)
        df_salary = df_salary.append(tmp)
    df_salary.to_csv('dtxe.salary.csv')

# Remove unnecessary data columns
# Parse through each county and append it to the main dataframe
def organize_employment():
    df_oe = pd.read_csv('dtxe.wages.csv')
    df_employment = pd.DataFrame()
    for c in ny_counties:
        tmp = df_oe[df_oe['County'] == c]
        tmp = tmp.loc[:,['Year','Average Employment']]
        tmp.rename(columns = {'Average Employment':c}, inplace = True)
        tmp['Year'] = tmp['Year'].astype(str)
        tmp = tmp.transpose()
        tmp.rename(columns=tmp.iloc[0], inplace = True)
        tmp.drop(tmp.index[0], inplace = True)
        df_employment = df_employment.append(tmp)
    df_employment.to_csv('dtxe.employment.csv')

# Remove unnecessary data columns
# Parse through each county and append it to the main dataframe 
def organize_crime():
    df_oc = pd.read_csv('dtxe.crime.csv')
    df_oc = df_oc[df_oc['Year'] > 2005]
    df_it = df_oc.groupby(['County','Year'])['Index Total'].sum().reset_index(name="Total Crime")
    df_it = organize_helper('Total Crime', df_it)
    df_it.to_csv('dtxe.totalcrime.csv')

# Helper function for removing unnecessary columns
# Parses through counties separately and then appends it to a main Dataframe
def organize_helper(col_ , df_):
    df = pd.DataFrame()
    for c in ny_counties:
        tmp = df_[df_['County'] == c]
        tmp = tmp.loc[:,['Year', col_ ]]
        tmp.rename(columns = {col_ : c}, inplace = True)
        tmp['Year'] = tmp['Year'].astype(str)
        tmp=tmp.transpose()
        tmp.rename(columns = tmp.iloc[0], inplace = True)
        tmp.drop(tmp.index[0], inplace = True)
        df = df.append(tmp)
    return df
    
# Sends violent crime data to the organizer helper function
def organize_violentcrime():
    df_oc = pd.read_csv('dtxe.crime.csv')
    df_oc = df_oc[df_oc['Year'] > 2005]
    df_it = df_oc.groupby(['County','Year'])['Violent Total'].sum().reset_index(name="Total Violent")
    df_it = organize_helper('Total Violent', df_it)
    df_it.to_csv('dtxe.violentcrime.csv')

# Sends property crime data to the organizer helper function
def organize_propertycrime():
    df_oc = pd.read_csv('dtxe.crime.csv')
    df_oc = df_oc[df_oc['Year'] > 2005]
    df_it = df_oc.groupby(['County','Year'])['Index Total'].sum().reset_index(name="Total Property")
    df_it = organize_helper('Total Property', df_it)
    df_it.to_csv('dtxe.propertycrime.csv')

#
#
#
# End of Data Cleaning Code
#
#
#

# Start of Visualization Production Code

# This is main()
# print("Hello World!")
# merge_rent()
# part-main()
# clean_wages()
# organize_wages()
# organize_employment()
# clean_crime()
# organize_crime()
# organize_violentcrime()
# organize_propertycrime()