import pandas as pd
import sys
import matplotlib.pyplot as plt
import seaborn as sb
import folium as fl
from folium.plugins import HeatMap
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score
from PIL import Image
import geopy.distance
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
import plotly.express as px

# Option to suppress warnings
import warnings
pd.set_option('mode.chained_assignment', None)
warnings.simplefilter("ignore", UserWarning)

# Function for loading training data, takes either a single station or the start and end number of stations
def load_training_data(first_station: int, last_station=0):
    # Checks values input are integer
    try:
        int(first_station)
        int(last_station)
    except ValueError:
        print("Please input an integer for stations to load")
        sys.exit("Error during load_training_data")
    # If no second station is provided, set last_station to be equal to first_station
    if last_station==0:
        last_station=first_station
    # If the stations are input in incorrectly, output an error
    if last_station<first_station:
        print("Make sure last station is larger or equal to first station")
        sys.exit("Error during load_training_data")
    # Checks if an invalid value is input
    if first_station< 1:
        print("Stations start at 1")
        sys.exit("Error during load_training_data")
    if last_station> 275:
        print("Stations only go up to 275")
        sys.exit("Error during load_training_data")
    #Create an empty dataframe
    df=pd.DataFrame()
    # Iterates over every station
    for i in range(first_station, last_station+1):
        # Enabled all stations to be read if necessary (different format for 1-200 and 201-275)
        if i<=200:
            # Joins the DF with the new loaded station, dropping the index. Stations are added in order of station number.
            df = pd.concat([df, pd.read_csv(f"Train/station_{str(i)}_deploy_full.csv")], ignore_index=True)
        if i>200:
            # Joins the DF with the new loaded station, dropping the index. Stations are added in order of station number.
            df = pd.concat([df, pd.read_csv(f"Train/station_{str(i)}_deploy.csv")], ignore_index=True)
    return df

# Uncomment the below code to test
# load_training_data(201, 275)

# Uncomment the below code to see the datatypes in the dataframe
#load_training_data(200).dtypes

# Takes a dataframe and creates columns to represent weekdays as numbers
def convert_weekdays(df):
    try:
        # Convert weekdays to lowercase, to prevent potential missed data
        df['weekday'] = df['weekday'].str.lower()
        
        # Create a column for each weekday and set the value to either 0 or 1, based on if it meets the weekday criteria 
        df['monday'] = (df['weekday'] == 'monday').astype(int)
        df['tuesday'] = (df['weekday'] == 'tuesday').astype(int)
        df['wednesday'] = (df['weekday'] == 'wednesday').astype(int)
        df['thursday'] = (df['weekday'] == 'thursday').astype(int)
        df['friday'] = (df['weekday'] == 'friday').astype(int)
        df['saturday'] = (df['weekday'] == 'saturday').astype(int)
        df['sunday'] = (df['weekday'] == 'sunday').astype(int)
        
        # Drop the weekday column
        df.drop(columns=['weekday'], inplace=True)
        
        # Fill the NaN values in case an entry didn't have a weekday in it
        df[['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday' ]] = df[['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday' ]].fillna(0)
    # If weekdays cannot be found, it outputs an error as all stations should contain this value
    except KeyError:
        print("DataFrame does not contain weekday column")
        sys.exit("Error during convert_weekdays")
    return df

# Uncomment the below code to test the above function
# convert_weekdays(load_training_data(200))

# Takes a dataframe and normalises the bike profile variables
def normalise_bike_profiles(df):
    try:
        # Divides each column value by the number of docks and stores it in-place
        df['bikes_3h_ago'] = df['bikes_3h_ago']/df['numDocks']
        df['full_profile_3h_diff_bikes'] = df['full_profile_3h_diff_bikes']/df['numDocks']
        df['full_profile_bikes'] = df['full_profile_bikes']/df['numDocks']
        df['short_profile_3h_diff_bikes'] = df['short_profile_3h_diff_bikes']/df['numDocks']
        df['short_profile_bikes'] = df['short_profile_bikes']/df['numDocks']
    # If a column isn't found it returns an error, as all stations should have these values
    except KeyError:
        print("DataFrame does not contain bike profiles columns")
        sys.exit("Error during normalise_bike_profiles")
    return df

# Uncomment the below code to test the above function
# normalise_bike_profiles(load_training_data(200))

# Takes a dataframe and normalises the bikes
def normalise_bikes(df):
    try:
        # Assumes there are no 0 docks, as that would be a redundant station.
        df['bikes'] = df['bikes']/df['numDocks']
    # If a column isn't found it returns an error, as all stations should have these values
    except KeyError:
        print("DataFrame does not contain bike or docks column(s)")
        sys.exit("Error during normalise_bikes")
    return df

# Uncomment the below code to test the above function
# normalise_bikes(load_training_data(200))

def handle_na(df):
    return df.dropna()

# Uncomment the below code to test the above function
#handle_na(load_training_data(200))

# Function to perform all pre-processing steps at once
def preprocessing(df, handlena=True):
    if handlena:
        df = handle_na(df)
    df = convert_weekdays(df)
    df = normalise_bike_profiles(df)
    df = normalise_bikes(df)
    return df

# Uncomment the below code to test the above function
# preprocessing(load_training_data(200))

# Function to get the correlation of the dataframe, must have a dataframe passed to it. If True is also passed it will output the matrix and save it
def correlation_matrix(df, show=False):
    # Uses pandas corr() to get correlation matrix
    correlation=df.corr()
    # If the function is called to show the function
    if show:
        # Sets figure size
        plt.figure(figsize=(20, 10))
        # Plots the heatmap, with 1 and -1 as boundaries and annotations. FMT rounds the values in the figure
        plot=sb.heatmap(correlation,  cmap="YlGnBu",vmin=-1, vmax=1, annot=True, fmt=".2f")
        # Adds a title
        plot.set_title('Correlation Heatmap', fontdict={'fontsize':18}, pad=12)
        # Saves the map as heatmap.png for later viewing.
        plt.savefig('heatmap.png', dpi=300, bbox_inches='tight')
    return correlation

# Uncomment this code to test the above functions
# correlation_matrix(convert_weekdays(load_training_data(201,275)), True)

# Function to get numeric descriptions of the dataframe, must have a dataframe passed to it. If True is also passed it will output the data and save it
def describe_dataframe(df, show=False):
    # Uses pandas describe to get numerical values
    description=df.describe(include='all')
    # If the function is called to show the function
    if show:
        # Outputs the dataframe
        print(description)
        # Saves it to a CSV
        description.to_csv("description.csv")
    return description

# Uncomment this code to test the above functions
# describe_dataframe(convert_weekdays(load_training_data(201,275)), True)

# Function to get averages for stations in the dataframe, must have a dataframe passed to it. If True is also passed it will output the data and save it
def average_bikes(df, show=False):
    # Uses pandas groupby and mean to get the averages for each station
    average=df.groupby('station').mean()
    if show:
        # Outputs the dataframe
        print(average)
        # Saves it to a CSV
        average.to_csv("average.csv")
    return average

# Uncomment this code to test the above functions
# average_bikes(convert_weekdays(load_training_data(201,275)), True)

# Function to get show a geographical heatmap for stations in the dataframe, must have a dataframe passed to it. If True is also passed it will output the map and save it
def geographic_view(df, show=False):   
    # Creates a map in Valencia
    map_view = fl.Map(location=[39.46975, -0.37739], zoom_start=12)
    
    # Adds markers to the heatmap for every station
    for index, row in df.iterrows():
        fl.Marker([row['latitude'], row['longitude']], popup=f"Average bikes: {row['bikes']}").add_to(map_view)
    
    # Creates heatmap data for the markers, using green to red for high to low values
    heat_data = [[row['latitude'], row['longitude'], row['bikes']] for index, row in df.iterrows()]
    HeatMap(heat_data, gradient={0.3: 'green', 0.6: 'yellow', 0.9:'red'}, radius=30, blur=10).add_to(map_view)
    
    if show:
        # Saves the map
        map_view.save('bike_rental_map.html')
        display(map_view)
    return map_view

# Uncomment this code to test the above functions
# geographic_view(average_bikes(preprocessing(load_training_data(201,275)), False))

# Function to perform all data_analysis steps at once
def data_analysis(df, show=False):
    correlation_matrix(df, show)
    describe_dataframe(df, show)
    geographic_view(average_bikes(df, show), show)

# Uncomment the below code to test the above function
data_analysis(preprocessing(load_training_data(201, 275)), True)

            station      latitude     longitude     numDocks     timestamp  \
count  42600.000000  42600.000000  42600.000000  42600.00000  4.260000e+04   
mean     238.000000     39.471010     -0.372940     19.56000  1.413765e+09   
std       21.648965      0.013061      0.023094      5.70209  5.950482e+05   
min      201.000000     39.444093     -0.409316     14.00000  1.412730e+09   
25%      219.000000     39.462808     -0.393181     15.00000  1.413251e+09   
50%      238.000000     39.469844     -0.375973     20.00000  1.413765e+09   
75%      257.000000     39.480591     -0.354640     20.00000  1.414275e+09   
max      275.000000     39.495034     -0.328902     40.00000  1.414793e+09   

          year    month           day         hour      weekhour  ...  \
count  42600.0  42600.0  42600.000000  42600.00000  42600.000000  ...   
mean    2014.0     10.0     19.610915     11.59331     84.593310  ...   
std        0.0      0.0      6.873305      6.87920     46.089231  ...   
min     2014.0     10.0      8.000000      0.00000      1.000000  ...   
25%     2014.0     10.0     14.000000      6.00000     48.000000  ...   
50%     2014.0     10.0     20.000000     12.00000     85.000000  ...   
75%     2014.0     10.0     26.000000     18.00000    121.000000  ...   
max     2014.0     10.0     31.000000     23.00000    168.000000  ...   

       short_profile_3h_diff_bikes  short_profile_bikes         bikes  \
count                 42600.000000         42600.000000  42600.000000   
mean                      0.000004             0.392049      0.395875   
std                       0.219118             0.286342      0.318846   
min                      -1.000000             0.000000      0.000000   
25%                      -0.087500             0.140351      0.100000   
50%                       0.000000             0.350000      0.333333   
75%                       0.088889             0.600000      0.666667   
max                       1.000000             1.000000      1.000000   

             monday       tuesday     wednesday      thursday        friday  \
count  42600.000000  42600.000000  42600.000000  42600.000000  42600.000000   
mean       0.126761      0.125000      0.163732      0.167254      0.165493   
std        0.332708      0.330723      0.370037      0.373206      0.371629   
min        0.000000      0.000000      0.000000      0.000000      0.000000   
25%        0.000000      0.000000      0.000000      0.000000      0.000000   
50%        0.000000      0.000000      0.000000      0.000000      0.000000   
75%        0.000000      0.000000      0.000000      0.000000      0.000000   
max        1.000000      1.000000      1.000000      1.000000      1.000000   

           saturday        sunday  
count  42600.000000  42600.000000  
mean       0.126761      0.125000  
std        0.332708      0.330723  
min        0.000000      0.000000  
25%        0.000000      0.000000  
50%        0.000000      0.000000  
75%        0.000000      0.000000  
max        1.000000      1.000000  

[8 rows x 31 columns]
          latitude  longitude  numDocks     timestamp    year  month  \
station                                                                
201      39.478178  -0.383541      27.0  1.413765e+09  2014.0   10.0   
202      39.479857  -0.379839      15.0  1.413765e+09  2014.0   10.0   
203      39.476715  -0.375433      25.0  1.413765e+09  2014.0   10.0   
204      39.474323  -0.370021      25.0  1.413765e+09  2014.0   10.0   
205      39.470784  -0.382701      15.0  1.413765e+09  2014.0   10.0   
...            ...        ...       ...           ...     ...    ...   
271      39.463440  -0.409316      20.0  1.413765e+09  2014.0   10.0   
272      39.490257  -0.406462      18.0  1.413765e+09  2014.0   10.0   
273      39.494136  -0.406020      16.0  1.413765e+09  2014.0   10.0   
274      39.480591  -0.332281      15.0  1.413765e+09  2014.0   10.0   
275      39.450273  -0.333363      15.0  1.413765e+09  2014.0   10.0   

               day      hour  weekhour  isHoliday  ...  \
station                                            ...   
201      19.610915  11.59331  84.59331   0.082746  ...   
202      19.610915  11.59331  84.59331   0.082746  ...   
203      19.610915  11.59331  84.59331   0.082746  ...   
204      19.610915  11.59331  84.59331   0.082746  ...   
205      19.610915  11.59331  84.59331   0.082746  ...   
...            ...       ...       ...        ...  ...   
271      19.610915  11.59331  84.59331   0.082746  ...   
272      19.610915  11.59331  84.59331   0.082746  ...   
273      19.610915  11.59331  84.59331   0.082746  ...   
274      19.610915  11.59331  84.59331   0.082746  ...   
275      19.610915  11.59331  84.59331   0.082746  ...   

         short_profile_3h_diff_bikes  short_profile_bikes     bikes    monday  \
station                                                                         
201                         0.001712             0.143660  0.163993  0.126761   
202                         0.004646             0.233382  0.259859  0.126761   
203                        -0.000035             0.165352  0.163662  0.126761   
204                         0.001174             0.223867  0.278380  0.126761   
205                         0.002944             0.220100  0.261972  0.126761   
...                              ...                  ...       ...       ...   
271                         0.000587             0.325572  0.273063  0.126761   
272                         0.000807             0.388180  0.573063  0.126761   
273                         0.001284             0.357220  0.451364  0.126761   
274                        -0.002690             0.762109  0.707864  0.126761   
275                        -0.002543             0.712500  0.720540  0.126761   

         tuesday  wednesday  thursday    friday  saturday  sunday  
station                                                            
201        0.125   0.163732  0.167254  0.165493  0.126761   0.125  
202        0.125   0.163732  0.167254  0.165493  0.126761   0.125  
203        0.125   0.163732  0.167254  0.165493  0.126761   0.125  
204        0.125   0.163732  0.167254  0.165493  0.126761   0.125  
205        0.125   0.163732  0.167254  0.165493  0.126761   0.125  
...          ...        ...       ...       ...       ...     ...  
271        0.125   0.163732  0.167254  0.165493  0.126761   0.125  
272        0.125   0.163732  0.167254  0.165493  0.126761   0.125  
273        0.125   0.163732  0.167254  0.165493  0.126761   0.125  
274        0.125   0.163732  0.167254  0.165493  0.126761   0.125  
275        0.125   0.163732  0.167254  0.165493  0.126761   0.125  

[75 rows x 30 columns]

# Function that takes a dataframe and optional cutoff point
# Returns all features that have a correlation score above the cutofff point
def feature_filtering(df, cutoff=0.5):
    correlation = correlation_matrix(df, False)
    # uses absolute value to allow for negative correlation
    correlation = correlation[abs((correlation['bikes'])>=cutoff)]
    # Drops the bikes index
    correlation = correlation.drop(['bikes'])
    return correlation.index.tolist()

# # Uncomment the below code for testing
# feature_filtering(preprocessing(load_training_data(201, 275)))

def feature_wrapping(df, show=False):
    # Creates test and training data out of the passed dataframes
    X = df.drop(columns=['bikes'])
    y = df['bikes']
    X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=1)
    
    # Creates a forward selection using the mlxtend library
    forward = SFS(LinearRegression(), k_features=(1, X.shape[1]), forward=True, floating=False, scoring = 'r2', cv = 0)
    
    # Creates a scalar pipeline to go through each feature
    pipe = make_pipeline(StandardScaler(), forward)
    
    # Fits the pipeline to the data
    pipe.fit(X_train, y_train)
    
    #Outputs a graph of the feature results
    if show:
        fig = plot_sfs(forward.get_metric_dict(), kind='std_dev', figsize=(20,10))
        plt.title('Sequential Forward Selection (w. StdDev)')
        plt.grid()
        plt.show()
        plt.savefig('forward.png', dpi=300, bbox_inches='tight')

    # Goes through each result of the forward selector, identifying the last accuracy score
    # Then checks if the next result has an accuracy score is at least 0.01 bigger
    # this prevents the model from being too greedy with the amount of features.
    feature_indx=[]
    features=[]
    last_avg_score=0
    results_dict=forward.get_metric_dict()
    for i in results_dict:
        if round(results_dict[i]['avg_score'],2) > last_avg_score:
            last_avg_score=round(results_dict[i]['avg_score'],2)
            feature_indx=results_dict[i]['feature_idx']
    for i in feature_indx:
        features.append(X.columns[i])
    return features

# Uncomment the below code for testing
# feature_wrapping(preprocessing(load_training_data(201, 275)), True)

# A function that combines the results of both feature selection methods
# If greedy = true (default) it combines the union of the lists
# else it returns the intersect
def feature_selection(df, greedy=True, cutoff=0.5):
    wrapping = feature_wrapping(df)
    filtering = feature_filtering(df, cutoff)
    if greedy:
        features=list(set(wrapping+filtering))
    else:
        features = list(set(wrapping) & set(filtering))
    return features

# Uncomment the below code for testing
# feature_selection(preprocessing(load_training_data(201, 275)), False)
# feature_selection(preprocessing(load_training_data(201, 275)))

# Defines the function which takes:
# a dataframe for testing (required)
# an approach type (optional), 'single' trains the model on all training data and 'separate' trains the model on each station
# a feature selection type (optional), 'filtering' implements feature filtering, 'wrapping' implements feature wrapping, 'combination' implements the union of both, 'combination-intersect' implements the intersect of both, and 'lasso' implements embedded feature selection
# a cutoff value (optional), this modifies the correlation levels required for feature filtering

def modelling(df, approach="single", selection="filtering", cutoff=0.5):
    # Reads the csv file of test data
    test=pd.read_csv('test.csv')
    # Creates a bikes column to enable preprocessing
    test['bikes']=0
    # Preprocesses to vectorise days of the week, but does not drop NA values
    test=preprocessing(test, False)
    # For every approach, the features are generated using the respective feature selection method
    # then the model is created and fit to the training dataframe using only the features and the bikes column
    # the model_results are then created by predicting using the model and the selected features
    if approach=="single":
        if selection=="filtering":
            features=feature_filtering(df, cutoff=cutoff)
            model=LinearRegression()
            model.fit(df[features], df['bikes'])
        elif selection=="wrapping":
            features=feature_wrapping(df)
            model=LinearRegression()
            model.fit(df[features], df['bikes'])
        elif selection=="combination":
            features=feature_selection(df, cutoff=cutoff)
            model=LinearRegression()
            model.fit(df[features], df['bikes'])
        elif selection=="combination-intersect":
            features=feature_selection(df, False)
            model=LinearRegression()
            model.fit(df[features], df['bikes'])
        elif selection=="lasso":
            features=list(df.columns)
            features.remove('bikes')
            model=Lasso()
            model.fit(df[features], df['bikes'])
        else:
            print("Incorrect selection type")
            sys.exit("Error during modelling")
        model_results = model.predict(test[features])
    elif approach=="separate":
        model_results=[]
        # For every station in the test data it creates a model
        for station in list(test['station'].unique()):
            station_df=df[(df['station']==station)]
            if selection=="filtering":
                features=feature_filtering(station_df, cutoff=cutoff)
                model=LinearRegression()
                model.fit(station_df[features], station_df['bikes'])
            elif selection=="wrapping":
                features=feature_wrapping(station_df)
                model=LinearRegression()
                model.fit(station_df[features], station_df['bikes'])
            elif selection=="combination":
                features=feature_selection(station_df, cutoff=cutoff)
                model=LinearRegression()
                model.fit(station_df[features], station_df['bikes'])
            elif selection=="combination-intersect":
                features=feature_selection(station_df, False)
                model=LinearRegression()
                model.fit(station_df[features], station_df['bikes'])
            elif selection=="lasso":
                features=list(station_df.columns)
                features.remove('bikes')
                model=Lasso()
                model.fit(station_df[features], station_df['bikes'])
            else:
                print("Incorrect selection type")
                sys.exit("Error during modelling")
            station_test=test[(test['station']==station)]
            model_results.extend(model.predict(station_test[features]))
    else:
        print("Incorrect approach type")
        sys.exit("Error during modelling")

    # Creates a dataframe from the results, then turns the % of bikes to real numbers and rounds to 0 to make the values whole numbers
    # The results are output as a dataframe stating the type of model created 
    results_df = pd.DataFrame({
        'Id': list(test['Id']),
        'bikes': model_results
    })
    results_df['bikes']=results_df['bikes']*test['numDocks']
    results_df = results_df.set_index('Id')
    results_df = results_df.round(0)
    results_df.to_csv(f'submission_{approach}_{selection}.csv')

# Uncomment the below code to test the above function
# modelling(preprocessing(load_training_data(201, 275)))

#modelling(preprocessing(load_training_data(201, 275)))
#modelling(preprocessing(load_training_data(201, 275)), selection='wrapping')
#modelling(preprocessing(load_training_data(201, 275)), selection='combination-intersect')
#modelling(preprocessing(load_training_data(201, 275)), selection='combination')
#modelling(preprocessing(load_training_data(201, 275)), selection='lasso')
#modelling(preprocessing(load_training_data(201, 275)), approach='separate', cutoff=0.3)
#modelling(preprocessing(load_training_data(201, 275)), approach='separate', selection='wrapping')
#modelling(preprocessing(load_training_data(201, 275)), approach='separate', selection='combination', cutoff=0.2)
# The below model cannot run on combined without a cutoff of 0
#modelling(preprocessing(load_training_data(201, 275)), approach='separate', selection='combination-intersect', cutoff=0)
#modelling(preprocessing(load_training_data(201, 275)), approach='separate', selection='lasso')

# Results manually input from kaggle competition MAE
results=pd.DataFrame({
    'Index': ['single', 'separate'],
    'Filtering': [2.656, 5.45422],
    'Wrapping': [2.33777, 1286.77866],
    'Combination': [2.36533, 1294.01244],
    'Combination-intersect': [2.61244, None],
    'LASSO': [5.37866, 8.62933]
})
results.set_index('Index', inplace=True)
display(results)

# The below image can be accessed through the datalore 'statistics' section of the above viewed dataframe 
img = Image.open('phase1stats.png')
display(img)

# The function takes no arguments and returns a dictionary of dictionaries containing dataframes
def model_loader():
    # Creates an empty dictionary
    models=dict()
    # For every station (as they are numbered 1 to 200) create a dictionary using the station number, then store each csv in it
    # the dictionary's first key is the station, and second key is the model's name from the csv.
    for i in range(1, 201):
        models[i]=dict()
        models[i]['full']=pd.read_csv(f'Models/model_station_{i}_rlm_full.csv')
        models[i]['full_temp']=pd.read_csv(f'Models/model_station_{i}_rlm_full_temp.csv')
        models[i]['short']=pd.read_csv(f'Models/model_station_{i}_rlm_short.csv')
        models[i]['short_temp']=pd.read_csv(f'Models/model_station_{i}_rlm_short_temp.csv')
        models[i]['short_full']=pd.read_csv(f'Models/model_station_{i}_rlm_short_full.csv')
        models[i]['short_full_temp']=pd.read_csv(f'Models/model_station_{i}_rlm_short_full_temp.csv')
    return models

# Uncomment the below code for testing
# model_loader()

# This function takes a list of dataframes and returns a single dataframe
def cumulative_model(models_list):
    try:
        model_values=pd.DataFrame(columns=['feature', 'weight'])
        model_values['feature']=models_list[0]['feature']
        model_values['weight']=models_list[0]['weight']*0
        proportion=1/len(models_list)
        for model in models_list:
            model_values['weight']+=model['weight']*proportion
        return model_values
    except KeyError:
        print("One or more models are not in correct format")
        sys.exit("Error cumulative_model")

# Uncomment the below code to test the above function
# cumulative_model([model_loader()[1]['full'], model_loader()[2]['full']]))

# As this function is specifically for the pretrained models, it implements model_loader rather than having parameters
def cumulative_pretrained_models():
    # Loads the pre-trained models and creates an empty list for the new models
    models = model_loader()
    models_list = dict()
    # Essentially transposes the models array, setting the model type as the key and the station specific models as the inner dictionary
    for i in models[1].keys():
         models_list[i] = [v.get(i, None) for v in models.values()]
         
    # Creates an empty dictionary for the cumulative models
    cum_models_list=dict()
    # For every model type (full, short, etc.) pass the list of models to cumulative_models and store the resulting model in the dictionary, using model name as the key
    for i in models_list:
        cum_models_list[i]=cumulative_model(models_list[i])
    return cum_models_list

# Uncomment the below code to test the above function
# cumulative_pretrained_models()

# This function takes a dataframe and returns the features, coefficients, and intercept of the pre-trained model passed to it
def get_model_details(df):
    try:
        # The features are calculated from the feature column, then intercept is omitted 
        features = list(df['feature'])
        features.remove('(Intercept)')
        # The coefficients are produced from the weight column, with the first value representing the intercept
        coefficients = df['weight'].to_numpy()
        intercept= coefficients[0]
        coefficients=coefficients[1:]
    # If the dataframe does not contain features and weights, an error is output
    except KeyError:
        print("One or more pre-trained models are not in correct format")
        sys.exit("Error in get_model_details")
    return features, coefficients, intercept

# Uncomment the below code to test the above function
# get_model_details(pd.DataFrame({'feature':['(Intercept)','Test'], 'weight': [1,1]}))

# This function takes the coefficients and intercept for a model, then returns a LinearRegression object with those values
def custom_linearmodel(coefficients, intercept):
    reg=LinearRegression()
    reg.intercept_=intercept
    reg.coef_= coefficients
    return reg

# Uncomment the below code to test the above function
# custom_linearmodel([1,2],3)

# The function takes no arguments and returns a dictionary of each station's optimum model
def model_selector():
    # Get all training stations
    df = preprocessing(load_training_data(201, 275))
    # Creates an empty dictionary and loads the pre-trained models using the model_loader
    station_models=dict()
    models=model_loader()
    # For every station in the dataframe, the function finds the optimal model
    for station in list(df['station'].unique()):
        # Sets the current stations dataframe
        station_df=df[(df['station']==station)]
        # Sets the initial r2 value, this is negative incase the best model still has a bad fit
        r2 = -100
        # Creates an empty dataframe for the model to be stored in
        station_models[station]=pd.DataFrame()
        # Goes through every pretrained station and every model within it
        for pretrained_station in models:
            for model in models[pretrained_station]:
                # Gets the features, coefficients, and intercept of the model 
                features, coefficients, intercept = get_model_details(models[pretrained_station][model])
                # Gets the features to predict
                station_df_x=station_df[features]
                # Gets the target values for scoring
                station_df_y=station_df['bikes']
                # predicts using a custom model on the values gathered above
                y_pred = custom_linearmodel(coefficients, intercept).predict(station_df_x)
                # Sets a new r2 score, then checks if this is higher than r2. If the score is higher, then new model is stored and r2 is updated
                new_r2 = r2_score(station_df_y, y_pred)
                if new_r2 > r2:
                    r2=new_r2
                    station_models[station]=models[pretrained_station][model]
    return station_models

# Uncomment the below code to test the above function
# model_selector(preprocessing(load_training_data(201, 275)))

# The function has two optional arguments, and doesn't return any values; instead, the results are output as a csv file. 
def pretrained_modelling(mode="cumulative", cum_type='full'):
    # Reads the csv file of test data
    test=pd.read_csv('test.csv')
    # Creates a bikes column to enable preprocessing
    test['bikes']=0
    # Preprocesses to vectorise days of the week, but does not drop NA values
    test=preprocessing(test, False)
    
    # If the mode is cumulative, it loads the cumulative pretrained models and uses the specific version to predict the test data
    if mode=='cumulative':
        models= cumulative_pretrained_models()
        features, coefficients, intercept = get_model_details(models[cum_type])
        model = custom_linearmodel(coefficients, intercept)
        model_results= model.predict(test[features])
    # If the mode is specific, model_selector is called and then every station in test.csv is predicted based on the optimal model 
    elif mode=='specific':
        model_results=[]
        models=model_selector()
        for station in list(test['station'].unique()):
            station_test=test[(test['station']==station)]
            features, coefficients, intercept = get_model_details(models[station])
            model = custom_linearmodel(coefficients, intercept)
            model_results.extend(model.predict(station_test[features]))
    else:
        print("Incorrect mode input")
        sys.exit("Error during pretrained_modelling")

    # Creates a dataframe from the results, then turns the % of bikes to real numbers and rounds to 0 to make the values whole numbers
    # The results are output as a dataframe stating the type of model created 
    results_df = pd.DataFrame({
        'Id': list(test['Id']),
        'bikes': model_results
    })
    results_df['bikes']=results_df['bikes']*test['numDocks']
    results_df = results_df.set_index('Id')
    results_df = results_df.round(0)
    results_df.to_csv(f'submission_{mode}_{cum_type}.csv')

# Uncomment the below code to test the above function
# pretrained_modelling('specific')

#pretrained_modelling(cum_type='full')
#pretrained_modelling(cum_type='full_temp')
#pretrained_modelling(cum_type='short')
#pretrained_modelling(cum_type='short_temp')
#pretrained_modelling(cum_type='short_full')
#pretrained_modelling(cum_type='short_full_temp')
#pretrained_modelling('specific')

# Results manually input from kaggle competition MAE
results=pd.DataFrame({
    'Index': ['Results'],
    'Cumulative full': [3.52444],
    'Cumulative full-temp': [4.54844],
    'Cumulative short': [12.27466],
    'Cumulative short-temp': [13.72177],
    'Cumulative short-full': [3.77777],
    'Cumulative short-full-temp': [4.62222],
    'Cumulative specific': [2.23377]
})
results.set_index('Index', inplace=True)
display(results)

# The function takes no arguments and returns a dictionary of each station's optimum model
def improved_model_selector():
    # Get all training stations
    df = preprocessing(load_training_data(201, 275))
    # Creates an empty dictionary and loads the pre-trained models using the model_loader
    station_models=dict()
    models=model_loader()

    #Performs wrapping cumulatively
    wrapping_features=feature_wrapping(df)
    
    # For every station in the dataframe, the function finds the optimal model
    for station in list(df['station'].unique()):
        # Sets the current stations dataframe
        station_df=df[(df['station']==station)]
        # Sets the initial r2 value, this is negative incase the best model still has a bad fit
        r2 = -100
        # Creates an empty dataframe for the model to be stored in
        station_models[station]=pd.DataFrame()
        # Goes through every pretrained station and every model within it
        for pretrained_station in models:
            for model in models[pretrained_station]:
                # Gets the features, coefficients, and intercept of the model 
                features, coefficients, intercept = get_model_details(models[pretrained_station][model])
                # Gets the features to predict
                station_df_x=station_df[features]
                # Gets the target values for scoring
                station_df_y=station_df['bikes']
                # predicts using a custom model on the values gathered above
                y_pred = custom_linearmodel(coefficients, intercept).predict(station_df_x)
                # Sets a new r2 score, then checks if this is higher than r2. If the score is higher, then new model is stored and r2 is updated
                new_r2 = r2_score(station_df_y, y_pred)
                if new_r2 > r2:
                    r2=new_r2
                    station_models[station]=models[pretrained_station][model]
        
        # Then check wrapping, taken from Phase 2 modelling
        model=LinearRegression()
        station_df_x=station_df[wrapping_features]
        station_df_y=station_df['bikes']
        model.fit(station_df[wrapping_features], station_df['bikes'])
        y_pred = model.predict(station_df_x)
        new_r2 = r2_score(station_df_y, y_pred)
        if new_r2 > r2:
            r2=new_r2
            # Added way to convert model to values, could be stand-alone function
            features_list = ['(Intercept)']
            features_list.extend(wrapping_features)
            coef_list = [model.intercept_]
            coef_list.extend(list(model.coef_))
            station_models[station]=pd.DataFrame({'feature': features_list,
                                                  'weight': coef_list
            })
    return station_models

# Uncomment the below code to test the above function
# improved_model_selector()

# The function has two optional arguments, and doesn't return any values; instead, the results are output as a csv file. 
def improved_pretrained_modelling():
    # Reads the csv file of test data
    test=pd.read_csv('test.csv')
    # Creates a bikes column to enable preprocessing
    test['bikes']=0
    # Preprocesses to vectorise days of the week, but does not drop NA values
    test=preprocessing(test, False)

    # If the mode is cumulative, it loads the cumulative pretrained models and uses the specific version to predict the test data
    # If the mode is specific, model_selector is called and then every station in test.csv is predicted based on the optimal model 
    model_results=[]
    models=improved_model_selector()
    for station in list(test['station'].unique()):
        station_test=test[(test['station']==station)]
        features, coefficients, intercept = get_model_details(models[station])
        model = custom_linearmodel(coefficients, intercept)
        model_results.extend(model.predict(station_test[features]))
    # Creates a dataframe from the results, then turns the % of bikes to real numbers and rounds to 0 to make the values whole numbers
    # The results are output as a dataframe stating the type of model created 
    results_df = pd.DataFrame({
        'Id': list(test['Id']),
        'bikes': model_results
    })
    results_df['bikes']=results_df['bikes']*test['numDocks']
    results_df = results_df.set_index('Id')
    results_df = results_df.round(0)
    results_df.to_csv(f'submission_improve_specific_full.csv')

# Uncomment the below code to test the above function
# improved_pretrained_modelling()

display(3.26)

3.26

geographic_view(average_bikes(preprocessing(load_training_data(1,275)), True))

          latitude  longitude  numDocks     timestamp    year  month  \
station                                                                
1        39.476803  -0.380379      20.0  1.413454e+09  2014.0   10.0   
2        39.476863  -0.371231      15.0  1.413454e+09  2014.0   10.0   
3        39.472766  -0.384174      15.0  1.413454e+09  2014.0   10.0   
4        39.474840  -0.379276      20.0  1.413454e+09  2014.0   10.0   
5        39.474288  -0.375175      23.0  1.413454e+09  2014.0   10.0   
...            ...        ...       ...           ...     ...    ...   
271      39.463440  -0.409316      20.0  1.413765e+09  2014.0   10.0   
272      39.490257  -0.406462      18.0  1.413765e+09  2014.0   10.0   
273      39.494136  -0.406020      16.0  1.413765e+09  2014.0   10.0   
274      39.480591  -0.332281      15.0  1.413765e+09  2014.0   10.0   
275      39.450273  -0.333363      15.0  1.413765e+09  2014.0   10.0   

               day       hour   weekhour  isHoliday  ...  \
station                                              ...   
1        16.014885  11.529093  84.431664   0.063599  ...   
2        16.014885  11.529093  84.431664   0.063599  ...   
3        16.014885  11.529093  84.431664   0.063599  ...   
4        16.014885  11.529093  84.431664   0.063599  ...   
5        16.014885  11.529093  84.431664   0.063599  ...   
...            ...        ...        ...        ...  ...   
271      19.610915  11.593310  84.593310   0.082746  ...   
272      19.610915  11.593310  84.593310   0.082746  ...   
273      19.610915  11.593310  84.593310   0.082746  ...   
274      19.610915  11.593310  84.593310   0.082746  ...   
275      19.610915  11.593310  84.593310   0.082746  ...   

         short_profile_3h_diff_bikes  short_profile_bikes     bikes    monday  \
station                                                                         
1                           0.000034             0.178112  0.160487  0.129905   
2                           0.000541             0.204308  0.208119  0.129905   
3                           0.000519             0.285002  0.227785  0.129905   
4                          -0.000288             0.223951  0.215562  0.129905   
5                           0.001486             0.234306  0.231217  0.129905   
...                              ...                  ...       ...       ...   
271                         0.000587             0.325572  0.273063  0.126761   
272                         0.000807             0.388180  0.573063  0.126761   
273                         0.001284             0.357220  0.451364  0.126761   
274                        -0.002690             0.762109  0.707864  0.126761   
275                        -0.002543             0.712500  0.720540  0.126761   

          tuesday  wednesday  thursday    friday  saturday    sunday  
station                                                               
1        0.128552   0.162382  0.161028  0.159675  0.129905  0.128552  
2        0.128552   0.162382  0.161028  0.159675  0.129905  0.128552  
3        0.128552   0.162382  0.161028  0.159675  0.129905  0.128552  
4        0.128552   0.162382  0.161028  0.159675  0.129905  0.128552  
5        0.128552   0.162382  0.161028  0.159675  0.129905  0.128552  
...           ...        ...       ...       ...       ...       ...  
271      0.125000   0.163732  0.167254  0.165493  0.126761  0.125000  
272      0.125000   0.163732  0.167254  0.165493  0.126761  0.125000  
273      0.125000   0.163732  0.167254  0.165493  0.126761  0.125000  
274      0.125000   0.163732  0.167254  0.165493  0.126761  0.125000  
275      0.125000   0.163732  0.167254  0.165493  0.126761  0.125000  

[275 rows x 30 columns]

# This function finds the centre of all the points through implementing a kmeans cluster algorithm with 1 centroid, it takes a dataframe as the argument
def find_geocentre(df):
    # Utilise average bikes to get the mean value of longitude and latitude
    average_df = average_bikes(df)
    # Implement KMeans, from sklearn, with 1 clusters
    kmeans = KMeans(n_clusters=1, n_init='auto')
    # Fit the kmeans to the latitude and longitude
    kmeans.fit(average_df[['latitude', 'longitude']])
    return list(kmeans.cluster_centers_)

# Uncomment the below code to test the above function
# find_geocentre(preprocessing(load_training_data(1,275)))

# This function finds the distance to the centre for every station, it takes a dataframe as the argument
def distance_to_geocentre(df):
    # calls find_geocentre to find the midpoint
    centroid = find_geocentre(df)
    # for every station, find the station's coordinates and then find the WGS-84 distance between them (accounting for earth curvature)
    # store the result as the value for every entry of a station
    # Using geopy as it implements WGS-84 (https://en.wikipedia.org/wiki/Haversine_formula)
    for station in list(df['station'].unique()):
        station_df=df[(df['station']==station)]
        station_coords=list(station_df[['latitude','longitude']].mean())
        station_distance=geopy.distance.geodesic(station_coords, centroid).km
        df.loc[df['station']==station, 'distance_to_geocentre'] = station_distance
    return df

# Uncomment the below code to test the above function
# distance_to_geocentre(preprocessing(load_training_data(1,275)))

# Calculates the distance between a station and all other stations, then sorts them by distance. Takes a dataframe as the argument
def distance_to_nearest_stations(df):
    # Creates an empty dictionary
    station_distance_dict=dict()
    # For every station create a station dataframe of only that station's values, then create an array of the station's location
    for station in list(df['station'].unique()):
        station_distance=[]
        station_df=df[(df['station']==station)]
        station_coords=list(station_df[['latitude','longitude']].mean())
        # for station pair, find the stations' coordinates and then find the WGS-84 distance between them (accounting for earth curvature)
        # store the results in a list for the station
        # Using geopy as it implements WGS-84 (https://en.wikipedia.org/wiki/Haversine_formula)
        for check_station in list(df['station'].unique()):
            if check_station != station:
                check_station_df=df[(df['station']==check_station)]
                check_station_coords=list(check_station_df[['latitude','longitude']].mean())
                # Using geopy as it implements WGS-84 (https://en.wikipedia.org/wiki/Haversine_formula)
                station_distance.append(geopy.distance.geodesic(station_coords, check_station_coords).km)
        # Sort the values and store them in the dictionary using the station as the key
        station_distance.sort()
        station_distance_dict[station]=station_distance
    return station_distance_dict

# Uncomment the below code to test the above function
# distance_to_nearest_stations(preprocessing(load_training_data(1,275)))+

# Finds the average distance between k stations, requires a dataframe, list of station distances in order, and a k value (defaulted to 1)
def average_distance_to_nearest_k_stations(df, station_distances, k=1):
    # For every station go through the first k values of the list of distances, dividing each value by k to find the average
    for station in list(df['station'].unique()):
        average_k_station_distance=0
        for i in range(k):
            average_k_station_distance+=station_distances[station][i]/k
        # Store the distance as the value for every entry of a station
        df.loc[df['station']==station, f'nearest_{k}_stations_dist'] = average_k_station_distance
    return df

# Uncomment the below code to test the above function
# average_distance_to_nearest_k_stations(preprocessing(load_training_data(1,275)), distance_to_nearest_stations(preprocessing(load_training_data(1,275))))

# Finds the number of stations with k km, requires a dataframe, list of station distances in order, and a k value (defaulted to 0.25)
def stations_within_k_km(df, station_distances, k=0.25):
    # for every station, go through the distances and add 1 to the count for every distance below or equal to k. 
    # Assumes order, so breaks on first value that doesn't meet the condition 
    for station in list(df['station'].unique()):
        number_stations=0
        for i in range(len(station_distances[station])):
            if station_distances[station][i] <=0.25:
                number_stations+=1
            else:
                break
        # Store the distance as the value for every entry of a station
        df.loc[df['station']==station, f'stations_within_{k}_km'] = number_stations
    return df

# Uncomment the below code to test the above function
# stations_within_k_km(preprocessing(load_training_data(1,275)), distance_to_nearest_stations(preprocessing(load_training_data(1,275))))

# Function that runs all the 'Exploring Location' functions and returns the dataframe
def exploring_location(df):
    station_distances = distance_to_nearest_stations(df)
    df = distance_to_geocentre(df)
    df = stations_within_k_km(df, station_distances)
    for i in range(5):
        df = average_distance_to_nearest_k_stations(df, station_distances, i+1)
    return df

# Uncomment the below code to test the above function
# exploring_location(preprocessing(load_training_data(1,275)))

# Modified correlation function that reduces the values shown, requires a average_distance_to_nearest_k_stations for 1-5
def simple_location_correlation_matrix(df, show=False):
    df=df[['latitude', 'longitude', 'numDocks', 'bikes_3h_ago', 'full_profile_3h_diff_bikes', 'short_profile_3h_diff_bikes', 'full_profile_bikes', 'short_profile_bikes', 'bikes', 'distance_to_geocentre', 'nearest_1_stations_dist', 'nearest_2_stations_dist', 'nearest_3_stations_dist', 'nearest_4_stations_dist', 'nearest_5_stations_dist', 'stations_within_0.25_km']]
    # Uses pandas corr() to get correlation matrix
    correlation=df.corr()
    # If the function is called to show the function
    if show:
        # Sets figure size
        plt.figure(figsize=(20, 10))
        # Plots the heatmap, with 1 and -1 as boundaries and annotations. FMT rounds the values in the figure
        plot=sb.heatmap(correlation,  cmap="YlGnBu",vmin=-1, vmax=1, annot=True, fmt=".2f")
        # Adds a title
        plot.set_title('Correlation Heatmap', fontdict={'fontsize':18}, pad=12)
        # Saves the map as heatmap.png for later viewing.
        plt.savefig('simple_heatmap.png', dpi=300, bbox_inches='tight')
    return correlation

# Uncomment the below code to test
simple_location_correlation_matrix(exploring_location(preprocessing(load_training_data(1,275))), True)

# Uncomment the below code to test
feature_wrapping(exploring_location(preprocessing(load_training_data(1,275))))

/opt/python/envs/default/lib/python3.8/site-packages/numpy/core/_methods.py:269: RuntimeWarning: Degrees of freedom <= 0 for slice
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
/opt/python/envs/default/lib/python3.8/site-packages/numpy/core/_methods.py:261: RuntimeWarning: invalid value encountered in scalar divide
  ret = ret.dtype.type(ret / rcount)

['bikes_3h_ago',
 'full_profile_3h_diff_bikes',
 'full_profile_bikes',
 'short_profile_3h_diff_bikes']

# Takes two arguments, a dataframe and optional show. Returns a tree and its features (x value it was trained on)
def create_tree(df, show=False):
    # Sets y to bikes
    y = df['bikes']
    # Uses feature wrapping to reduce the features, then trains the tree model on the features 
    features = list(feature_wrapping(df))
    x = df[features]
    # Create a decision tree regressor
    tree = DecisionTreeRegressor()
    # Fit the decision tree on your data
    tree.fit(x, y)
    # If show, create a figure from the tree
    if show:
        fig, ax = plt.subplots(figsize=(10, 6))
        plot_tree(tree, filled=True, feature_names=features, ax=ax)
        fig.show()
        plt.savefig('subgroups.png', dpi=300, bbox_inches='tight')
    return tree, x

import numpy as np

# Code from https://github.com/scikit-learn/scikit-learn/issues/20428
def export_dict(tree, feature_names=None):
    if feature_names is None:
        feature_names = range(tree.max_features_)

    # Build tree nodes
    tree_nodes = []
    for i in range(tree.node_count):
        if tree.children_left[i] == tree.children_right[i]:
            tree_nodes.append(
                tree.classes_[np.argmax(tree.value[i])]
            )
        else:
            tree_nodes.append({
                "feature": feature_names[tree.feature[i]],
                "value": tree.threshold[i],
                "left": tree.children_left[i],
                "right": tree.children_right[i],
            })

    # Link tree nodes
    for node in tree_nodes:
        if isinstance(node, dict):
            node["left"] = tree_nodes[node["left"]]
        if isinstance(node, dict):
            node["right"] = tree_nodes[node["right"]]

    # Return root node
    return tree_nodes[0]

# Takes a datafrane and show to pass to create_tree
def get_subgroups(df, show=False):
    # Creates a tree and identifies the first nodes
    tree, features = create_tree(df, show)
    tree_ = tree.tree_
    # Gets the subgroups through the export_dict function, then prints the subgroups out
    # Commented out due to computational requirements
    # export_dict(tree_, features.columns)
    print("Extracted Subgroups:")
    for subgroup, prediction in subgroups:
        print(f"Subgroup Condition: {subgroup}, Predicted Value: {prediction}")

get_subgroups(preprocessing(load_training_data(200,275)), True)

Error in callback <function _draw_all_if_interactive at 0x7fe417b705e0> (for post_execute):
Error in callback <function flush_figures at 0x7fe40dfd69d0> (for post_execute):

/opt/python/envs/default/lib/python3.8/site-packages/numpy/core/_methods.py:269: RuntimeWarning: Degrees of freedom <= 0 for slice
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
/opt/python/envs/default/lib/python3.8/site-packages/numpy/core/_methods.py:261: RuntimeWarning: invalid value encountered in scalar divide
  ret = ret.dtype.type(ret / rcount)

---------------------------------------------------------------------------
Traceback (most recent call last)
    at line 120 in flush_figures()
    at line 40 in show(close, block)
    at line 320 in display(include, exclude, metadata, transient, display_id, *objs, **kwargs)
    at line 180 in format(self, obj, include, exclude)
    at line 232 in fun(*args, **kw)
    at line 224 in catch_format_error(method, self, *args, **kwargs)
    at line 341 in __call__(self, obj)
    at line 151 in print_figure(fig, fmt, bbox_inches, base64, **kwargs)
    at line 2342 in print_figure(self, filename, dpi, facecolor, edgecolor, orientation, format, bbox_inches, pad_inches, bbox_extra_artists, backend, **kwargs)
    at line 95 in draw_wrapper(artist, renderer, *args, **kwargs)
    at line 72 in draw_wrapper(artist, renderer)
    at line 3140 in draw(self, renderer)
    at line 131 in _draw_list_compositing_images(renderer, parent, artists, suppress_composite)
    at line 72 in draw_wrapper(artist, renderer)
    at line 3064 in draw(self, renderer)
    at line 131 in _draw_list_compositing_images(renderer, parent, artists, suppress_composite)
    at line 72 in draw_wrapper(artist, renderer)
    at line 2032 in draw(self, renderer)
    at line 39 in draw_wrapper(artist, renderer, *args, **kwargs)
    at line 4352 in draw(self, renderer)
    at line 4327 in _get_path_in_displaycoord(self)
    at line 2732 in __call__(self, posA, posB, shrinkA, shrinkB, patchA, patchB)
    at line 2710 in _clip(self, path, in_start, in_stop)
    at line 370 in split_path_inout(path, inside, tolerance, reorder_inout)
    at line 333 in split_bezier_intersecting_with_closedpath(bezier, inside_closedpath, tolerance)
    at line 169 in find_bezier_t_intersecting_with_closedpath(bezier_point_at_t, inside_closedpath, t0, t1, tolerance)
    at line 222 in point_at_t(self, t)
    at line 215 in __call__(self, t)
KeyboardInterrupt:

# The below image can be accessed through the datalore 'get_tree' show 
img = Image.open('subgroups.png')
display(img)

# Function takes a dataframe and the regressor
def model_regression(df, regr): 
    # Identifies the standard target variables and features from earlier
    target_column = 'bikes'
    features=df.drop(columns=[target_column])
    # Fits the model to the features and target column
    regr.fit(features, df[target_column])
    # Reads the csv file of test data
    test=pd.read_csv('test.csv')
    # Creates a bikes column to enable preprocessing
    test['bikes']=0
    # Preprocesses to vectorise days of the week, but does not drop NA values
    test=preprocessing(test, False)
    # creates a second test df for organising the results
    test_2=test.drop(columns=['bikes', 'Id'])
    # Predicts using the regression
    pred = regr.predict(test_2)
    # Standard results format from earlier, with csv named after regressor and parameters
    results_df = pd.DataFrame({
        'Id': list(test['Id']),
        'bikes': pred
    })
    results_df['bikes']=results_df['bikes']*test['numDocks']
    results_df = results_df.set_index('Id')
    results_df = results_df.round(0)
    results_df.to_csv(f'submission_{str(regr)}.csv')

#from sklearn.ensemble import RandomForestRegressor
#model_regression(preprocessing(load_training_data(200,275)), RandomForestRegressor(max_depth=2, random_state=0))
#from sklearn.ensemble import GradientBoostingRegressor
#model_regression(preprocessing(load_training_data(200,275)), GradientBoostingRegressor(random_state=0))
#from sklearn.linear_model import Ridge
#model_regression(preprocessing(load_training_data(200,275)), Ridge(alpha=.5))
#from sklearn.svm import SVR
#model_regression(preprocessing(load_training_data(200,275)), SVR())

# Results manually input from kaggle competition MAE
results=pd.DataFrame({
    'Index': ['Results'],
    'RandomForest': [3.14222],
    'GradientBoostingRegressor': [2.16266],
    'Ridge': [75.78044],
    'SVR': [5.35377],
})
results.set_index('Index', inplace=True)
display(results)

	latitude	longitude	numDocks	bikes_3h_ago	full_profile_3h_diff_bikes	short_profile_3h_diff_bikes	full_profile_bikes	short_profile_bikes	bikes	distance_to_geocentre	nearest_1_stations_dist	nearest_2_stations_dist	nearest_3_stations_dist	nearest_4_stations_dist	nearest_5_stations_dist	stations_within_0.25_km
latitude	1.000000	-0.134558	-0.019958	-0.140002	-0.000002	0.000064	-0.248473	-0.195587	-0.139949	0.046926	0.044670	0.061238	0.033212	0.019192	0.012553	-0.039158
longitude	-0.134558	1.000000	0.198852	0.216572	0.000097	0.000918	0.324791	0.210502	0.216655	0.015600	-0.173437	-0.195080	-0.160594	-0.143028	-0.130643	0.108470
numDocks	-0.019958	0.198852	1.000000	-0.143889	0.001580	0.001284	-0.182501	-0.195397	-0.143243	-0.223029	-0.392536	-0.299922	-0.270895	-0.254004	-0.246160	0.470361
bikes_3h_ago	-0.140002	0.216572	-0.143889	1.000000	-0.232694	-0.244500	0.475698	0.451505	0.678960	0.153801	0.072238	0.033434	0.039563	0.043197	0.049235	-0.142489
full_profile_3h_diff_bikes	-0.000002	0.000097	0.001580	-0.232694	1.000000	0.871116	0.369663	0.352138	0.263766	-0.002281	-0.001334	-0.001056	-0.001075	-0.001122	-0.001188	0.001321
short_profile_3h_diff_bikes	0.000064	0.000918	0.001284	-0.244500	0.871116	1.000000	0.312002	0.392555	0.262511	-0.001694	-0.000832	-0.000532	-0.000416	-0.000435	-0.000484	0.000900
full_profile_bikes	-0.248473	0.324791	-0.182501	0.475698	0.369663	0.312002	1.000000	0.873589	0.646214	0.204242	0.135493	0.074712	0.071045	0.070768	0.075675	-0.191238
short_profile_bikes	-0.195587	0.210502	-0.195397	0.451505	0.352138	0.392555	0.873589	1.000000	0.642296	0.183947	0.114036	0.050962	0.046381	0.045245	0.051879	-0.194993
bikes	-0.139949	0.216655	-0.143243	0.678960	0.263766	0.262511	0.646214	0.642296	1.000000	0.152510	0.071453	0.032804	0.039053	0.042739	0.048800	-0.141641
distance_to_geocentre	0.046926	0.015600	-0.223029	0.153801	-0.002281	-0.001694	0.204242	0.183947	0.152510	1.000000	0.498209	0.516526	0.548141	0.560127	0.578805	-0.313437
nearest_1_stations_dist	0.044670	-0.173437	-0.392536	0.072238	-0.001334	-0.000832	0.135493	0.114036	0.071453	0.498209	1.000000	0.883601	0.795692	0.745018	0.718233	-0.659941
nearest_2_stations_dist	0.061238	-0.195080	-0.299922	0.033434	-0.001056	-0.000532	0.074712	0.050962	0.032804	0.516526	0.883601	1.000000	0.968816	0.936948	0.913767	-0.478593
nearest_3_stations_dist	0.033212	-0.160594	-0.270895	0.039563	-0.001075	-0.000416	0.071045	0.046381	0.039053	0.548141	0.795692	0.968816	1.000000	0.991643	0.979736	-0.388097
nearest_4_stations_dist	0.019192	-0.143028	-0.254004	0.043197	-0.001122	-0.000435	0.070768	0.045245	0.042739	0.560127	0.745018	0.936948	0.991643	1.000000	0.995920	-0.343126
nearest_5_stations_dist	0.012553	-0.130643	-0.246160	0.049235	-0.001188	-0.000484	0.075675	0.051879	0.048800	0.578805	0.718233	0.913767	0.979736	0.995920	1.000000	-0.324494
stations_within_0.25_km	-0.039158	0.108470	0.470361	-0.142489	0.001321	0.000900	-0.191238	-0.194993	-0.141641	-0.313437	-0.659941	-0.478593	-0.388097	-0.343126	-0.324494	1.000000

Table of Contents¶

Notebook Overview¶

Competition Description¶

Library Imports¶

Phase 1¶

Competition Description¶

Overview¶

Data Loading¶

Preprocessing¶

Data Analysis¶

Feature Selection¶

Modelling¶

Outputs and Evaluation¶

Phase 2¶

Competition Description¶

Overview¶

Model Loading¶

Model Creation¶

Model Selection¶

Modelling¶

Outputs and Evaluation¶

Phase 3¶

Competition Description¶

Overview¶

Merging Best Methods¶

Exploring Location¶

Subgroup Exploration¶

Implementing Further Models¶

	Filtering	Wrapping	Combination	Combination-intersect	LASSO
Index
single	2.65600	2.33777	2.36533	2.61244	5.37866
separate	5.45422	1286.77866	1294.01244	NaN	8.62933

	Cumulative full	Cumulative full-temp	Cumulative short	Cumulative short-temp	Cumulative short-full	Cumulative short-full-temp	Cumulative specific
Index
Results	3.52444	4.54844	12.27466	13.72177	3.77777	4.62222	2.23377

	RandomForest	GradientBoostingRegressor	Ridge	SVR
Index
Results	3.14222	2.16266	75.78044	5.35377