!pip install xgboost

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import itertools

from scipy.stats import uniform
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from xgboost import XGBClassifier

from pandas.api.types import CategoricalDtype
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator

import requests
from IPython.core.display import HTML
styles = requests.get(
    "https://raw.githubusercontent.com/Harvard-IACS/2021-CS109A/master/"
    "themes/static/css/cs109.css"
).text
HTML(styles)

# Load in the data (2018, 2017)
df_2018 = pd.read_csv('data/2018_Financial_Data.csv')
df_2018 = df_2018.rename(columns={'Unnamed: 0':'Ticker'})

df_2017 = pd.read_csv('data/2017_Financial_Data.csv')
df_2017 = df_2017.rename(columns={'Unnamed: 0':'Ticker'})

df_2018 = df_2018.set_index('Ticker')
df_2017 = df_2017.set_index('Ticker')

df_2017 = df_2017.reindex(df_2018.index)

df_2018.head()

df_2018.describe()

df_2018.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4392 entries, CMCSA to ZYME
Columns: 224 entries, Revenue to Class
dtypes: float64(222), int64(1), object(1)
memory usage: 7.5+ MB

# Calculate value counts
df_2018['Class'].value_counts()

Class
1    3046
0    1346
Name: count, dtype: int64

sns.histplot(df_2018, x='2019 PRICE VAR [%]')
plt.xlabel('2019 Price Variation [%] ')
plt.title('Distribution of Price Variations')
plt.axvline(x=0, color='red', linestyle='--', linewidth=2)
plt.xlim(-110, 200)
plt.show()

sns.histplot(df_2018, x='Revenue', log_scale=True)
plt.xlabel('Revenue (log Scale)')
plt.title('Distribution of Company Revenue')
plt.show()

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Counts
sns.histplot(df_2018, x='Revenue', hue='Class', log_scale=True, ax=axes[0])
axes[0].set_xlabel('Revenue (log Scale)')
axes[0].set_title('Distribution of Company Revenue with Class(Count)')

# Proportion
sns.histplot(df_2018, x='Revenue', hue='Class', log_scale=True, stat='proportion', common_norm=False, ax=axes[1])
axes[1].set_xlabel('Revenue (log Scale)')
axes[1].set_title('Distribution of Company Revenue with Class(Proportion)')

plt.tight_layout()
plt.show()

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Revenue vs. 2019 Price Change
sns.scatterplot(x='Revenue', y='2019 PRICE VAR [%]', data=df_2018, ax=axes[0])
axes[0].set_title('Revenue vs. 2019 Price Change (SymLog Scale)')
axes[0].set_xlabel('Revenue')
axes[0].set_ylabel('2019 Price Change (%)')

# horizontal line at y=0
axes[0].axhline(y=0, color='red', linestyle='--', linewidth=2)
axes[0].set_xscale('log')
axes[0].set_yscale('symlog')

# Revenue Growth vs. 2019 Price Change
sns.scatterplot(x='Revenue Growth', y='2019 PRICE VAR [%]', data=df_2018, ax=axes[1])
axes[1].set_title('Revenue Growth vs. 2019 Price Change (SymLog Scale)')
axes[1].set_xlabel('Revenue Growth')
axes[1].set_ylabel('2019 Price Change (%)')

# horizontal line at y=0
axes[1].axhline(y=0, color='red', linestyle='--', linewidth=2)
axes[1].set_xscale('symlog')
axes[1].set_yscale('symlog')

plt.tight_layout()
plt.show()

df_2018['Sector'].value_counts()

Sector
Financial Services        824
Healthcare                691
Technology                636
Industrials               574
Consumer Cyclical         506
Basic Materials           276
Real Estate               255
Energy                    248
Consumer Defensive        191
Utilities                 102
Communication Services     89
Name: count, dtype: int64

# Calculate value counts
sector_counts = df_2018['Sector'].value_counts()

# Plot the bar chart
fig, ax1 = plt.subplots()

# Plot count on the left y-axis
ax1.bar(sector_counts.index, sector_counts.values)
ax1.set_xlabel('Sector')
ax1.set_ylabel('Count')
ax1.set_xticks(range(len(sector_counts)))
ax1.set_xticklabels(sector_counts.index, rotation=45, ha="right")
ax1.tick_params('y')
ax1.set_title(r'Distribution of $Sector$')

# Create a twin Axes on the right side for proportion
ax2 = ax1.twinx()
ax2.set_ylim(0,.2)
ax2.set_ylabel('Proportion')

# Calculate proportion and plot on the right y-axis
proportion = sector_counts / sector_counts.sum()
ax2.bar(sector_counts.index, proportion)

plt.show()

sns.histplot(data=df_2018, x='Sector', hue='Class', multiple="stack", shrink=0.8)
plt.title('Breakdown of Sector Counts with Class Hue')
plt.xlabel('Sector')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')

plt.show()

all_predictors = df_2018.drop(columns='2019 PRICE VAR [%]').columns

corrs = df_2018.drop(columns='2019 PRICE VAR [%]').corr(numeric_only=True)
correlation_with_class = corrs['Class'].sort_values()[-21:-2]

correlation_with_class.plot(kind='barh')
plt.title('Top 20 Predictors by Correlation with Response (Class)')
plt.xlabel('Correlation')
plt.ylabel('Predictor')
plt.show()

num_cols = df_2018.columns.drop(['Class', 'Sector', '2019 PRICE VAR [%]'])

perc_change_df = (df_2018[num_cols] - df_2017[num_cols]) / df_2017[num_cols]
perc_change_df.columns = perc_change_df.columns + ' (YoY)'

df = pd.concat([df_2018, perc_change_df], axis=1)
display(df.shape, df.head())

(4392, 445)

duplicate_cols = list()
for i, col1 in enumerate(df.columns):
    for col2 in df.columns[i + 1:]:
        if df[col1].equals(df[col2]):
            duplicate_cols.append((col1, col2))
cols_to_drop = [pair[0] for pair in duplicate_cols]
df = df.drop(columns=cols_to_drop)
df.shape

(4392, 397)

ks = list(range(0, 4392, 12))
# Store the number of columns with > k missing values
num_cols = []
for k in ks:
    # Count the number of missing values per column
    missing_values_count = df.isnull().sum()

    # Filter columns with more than k missing values
    low_quality_columns = missing_values_count[missing_values_count >= k].index.tolist()

    # Append the result to num_cols
    num_cols.append(low_quality_columns)

# Flatten the list of lists
num_cols_flat = list(itertools.chain.from_iterable(num_cols))

plt.plot(ks, [len(cols) for cols in num_cols])
plt.xlabel(rf'$k$')
plt.axvline(x=2000, c='k', ls='--', label=r'$k=2000$ (cutoff)')
plt.title(r'Number of Columns with at least $k$ missing values')
plt.legend();

sparse_cols = df.count()[df.count() <= 2000].index
df = df.drop(columns=sparse_cols)

df = df.rename(columns={'2019 PRICE VAR [%]': 'Variation'})

X = df.drop(columns=['Variation', 'Class'])
y = df['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=209)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3513, 376), (879, 376), (3513,), (879,))

oh_encoder = OneHotEncoder(drop=None, handle_unknown='ignore')
X_train_ohe = oh_encoder.fit_transform(X_train[['Sector']])
X_train_ohe = pd.DataFrame(X_train_ohe.toarray(), columns=oh_encoder.get_feature_names_out(['Sector']))
X_train_ohe.index = X_train.index
X_train = pd.concat([X_train.drop(columns=['Sector']), X_train_ohe], axis=1)

X_test_ohe = oh_encoder.transform(X_test[['Sector']])
X_test_ohe = pd.DataFrame(X_test_ohe.toarray(), columns=oh_encoder.get_feature_names_out(['Sector']))
X_test_ohe.index = X_test.index
X_test = pd.concat([X_test.drop(columns=['Sector']), X_test_ohe], axis=1)

# Function to plot the correlations by sector
sectors = df['Sector'].unique().tolist()
# Initialize dictionary to store predictor-response correlations within each sector
sector_corrs = {}

# Store the correlations between the response and each predictor acros ALL sectors (i.e., the whole dataset)
all_corr = df.corrwith(df['Class'], numeric_only=True)
sector_corrs['All'] = all_corr

# Store correlations for the data within each sector
for sector in sectors:
    corrs = df[df['Sector'] == sector].corrwith(df['Class'], numeric_only=True)
    sector_corrs[sector] = corrs

# Convert to df and drop response (trivial; corr = 1)
sector_corrs_df = pd.DataFrame(sector_corrs)
sector_corrs_df = sector_corrs_df.drop('Class')

# Plot
melted_df = sector_corrs_df.melt(var_name='Sector', value_name='Correlation')

plt.figure(figsize=(15, 8))
sns.boxplot(x='Sector', y='Correlation', data=melted_df)
plt.title('Predictor-Response Correlations', fontsize=15)
plt.xticks(rotation=45, ha='right')
plt.show()

/Users/Tomas/micromamba/envs/cs109a/lib/python3.11/site-packages/numpy/lib/function_base.py:2897: RuntimeWarning: invalid value encountered in divide
  c /= stddev[:, None]
/Users/Tomas/micromamba/envs/cs109a/lib/python3.11/site-packages/numpy/lib/function_base.py:2898: RuntimeWarning: invalid value encountered in divide
  c /= stddev[None, :]
/Users/Tomas/micromamba/envs/cs109a/lib/python3.11/site-packages/numpy/lib/function_base.py:2742: RuntimeWarning: invalid value encountered in subtract
  X -= avg[:, None]
/Users/Tomas/micromamba/envs/cs109a/lib/python3.11/site-packages/numpy/core/_methods.py:118: RuntimeWarning: invalid value encountered in reduce
  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)

# Replace infinity values with NaN
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)

# Define a function to apply a transformation to data and return them
def transform_data(transformer, X_train, X_test):
    X_train_trans = transformer.fit_transform(X_train)
    X_train_trans = pd.DataFrame(X_train_trans, columns=transformer.get_feature_names_out(), index=X_train.index)
    X_test_trans = transformer.transform(X_test)
    X_test_trans = pd.DataFrame(X_test_trans, columns=transformer.get_feature_names_out(), index=X_test.index)
    return X_train_trans, X_test_trans

std_scaler = StandardScaler()
X_train_std, X_test_std = transform_data(std_scaler, X_train, X_test)

mean_imp = SimpleImputer(strategy='mean')
X_train_final, X_test_final = transform_data(mean_imp, X_train_std, X_test_std)

# knn_imp = KNNImputer(n_neighbors=3)
# X_train_knnimp, X_test_knnimp = transform_data(knn_imp, X_train, X_test)

# Naive model (old data)
data = pd.Series()
for i in range(7, 3, -1):
    current_data = pd.read_csv(f'data/201{i}_Financial_Data.csv')['Class']
    data = pd.concat([data, current_data], ignore_index=True)

print(f"{data.mean() * 100:.2f}% of stocks in the data from 2014-2017 improved.")
print(f"{df_2018['Class'].mean() * 100:.2f}% of stocks in the data from 2018 improved.")

51.53% of stocks in the data from 2014-2017 improved.
69.35% of stocks in the data from 2018 improved.

/var/folders/7h/kr5s22ts0hv9gdztycnz_rtw0000gn/T/ipykernel_56640/2735913412.py:5: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.
  data = pd.concat([data, current_data], ignore_index=True)

# Naive Model
y_pred_train_naive = np.ones_like(y_train)
y_pred_test_naive = np.ones_like(y_test)

naive_train_acc = accuracy_score(y_train, y_pred_train_naive)
naive_test_acc = accuracy_score(y_test, y_pred_test_naive)

print(f'Naive Model Train Accuracy: {naive_train_acc}')
print(f'Naive Model Test Accuracy: {naive_test_acc}')

Naive Model Train Accuracy: 0.695417022487902
Naive Model Test Accuracy: 0.6860068259385665

# Logistic Regression (L1)
Cs = [.005, .01, .02, .05]
lasso = LogisticRegressionCV(Cs=Cs, cv=5, penalty='l1', solver='saga', max_iter=1000, n_jobs=-1, random_state=209).fit(X_train_final, y_train)
best_C = lasso.C_[0]

y_pred_train_lasso = lasso.predict(X_train_final)
y_pred_test_lasso = lasso.predict(X_test_final)

lasso_train_acc = accuracy_score(y_train, y_pred_train_lasso)
lasso_test_acc = accuracy_score(y_test, y_pred_test_lasso)

# Store the top 10 predictors' coefs and their names
coefficients = lasso.coef_[0]
feature_names = X_train.columns
coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
coef_df = coef_df.reindex(coef_df['Coefficient'].abs().sort_values(ascending=False).index)
top_10_coefs = coef_df.head(10)

# Plot
plt.figure(figsize=(10, 6))
plt.barh(top_10_coefs['Feature'], top_10_coefs['Coefficient'], color='skyblue')
plt.xlabel('Coefficient Value')
plt.title('Top 10 Coefficients in the L1-regularized Logistic Regression')
plt.grid(axis='x')
plt.show()

# Store predictors deemed most important by LogisticRegressionCV (i.e., their coefficient was not driven to 0)
predictors = X_train_final.columns
important_predictors = []
for predictor, coef in zip(predictors, lasso.coef_.tolist()[0]):
    if coef > 0:
        important_predictors.append(predictor)

display(X_train_final.shape)

(3513, 386)

print(rf'Out of the original {len(predictors)} features, the L1-regularized logistic regression model drives the weights of {len(predictors) - len(important_predictors)} of them to 0.')

Out of the original 386 features, the L1-regularized logistic regression model drives the weights of 331 of them to 0.

# L1 Logistic Performance
print(f'L1 Logistic Train Accuracy: {lasso_train_acc}')
print(f'L1 Logistic Test Accuracy: {lasso_test_acc}')

L1 Logistic Train Accuracy: 0.7349843438656419
L1 Logistic Test Accuracy: 0.7076222980659841

# kNN Classifier
ks = range(10,101,10)
train_mse_means = []
val_mse_means = []

for k in ks:
    knn_cv = cross_validate(KNeighborsClassifier(n_neighbors=k),
                           X_train_final, y_train,cv=5,scoring='accuracy',return_train_score=True)
    train_mse_means.append(np.mean(knn_cv['train_score']))
    val_mse_means.append(np.mean(knn_cv['test_score']))

best_idx = np.argmax(val_mse_means)
best_k = ks[best_idx]

# Plot the accuracies
plt.plot(ks, train_mse_means)
plt.plot(ks, val_mse_means)
# Add labels
plt.xlabel(r'$k$')
plt.ylabel('accuracy')
plt.axvline(best_k, c='k', ls='--', label=rf'best $k={best_k}$')
plt.title(r'kNN Classifier CV Scores')
plt.xticks(ks)
plt.legend();

# Fit a final kNN model on all the data
knn = KNeighborsClassifier(n_neighbors=best_k).fit(X_train_final, y_train)

y_pred_train_knn = knn.predict(X_train_final)
y_pred_test_knn = knn.predict(X_test_final)

knn_train_acc = accuracy_score(y_train, y_pred_train_knn)
knn_test_acc = accuracy_score(y_test, y_pred_test_knn)

# kNN Performance
print(f'kNN Train Accuracy: {knn_train_acc}')
print(f'kNN Test Accuracy: {knn_test_acc}')

kNN Train Accuracy: 0.7122117847993168
kNN Test Accuracy: 0.6996587030716723

# Single Decision Tree CV
train_scores = []
cvmeans = []
depths = list(range(1,21))

for max_depth in depths:
    tree = DecisionTreeClassifier(max_depth=max_depth, random_state=109)
    tree.fit(X_train_final, y_train)
    train_scores.append(accuracy_score(y_train, tree.predict(X_train_final)))
    scores = cross_val_score(estimator=DecisionTreeClassifier(max_depth=max_depth, random_state=109), X=X_train_final, y=y_train, cv=5, n_jobs=-1)
    cvmeans.append(scores.mean())

best_idx = np.argmax(cvmeans)
best_depth_tree = depths[best_idx]

cvmeans = np.array(cvmeans)
depths = range(1, 21)

plt.plot(depths, train_scores, label = 'training scores')
plt.plot(depths, cvmeans, label = 'mean validation scores')
plt.axvline(best_depth_tree, c='k', ls='--', label=f'best depth={best_depth_tree}')

plt.title('Non-CV Training Accuracy vs. CV Validation Accuracy')

plt.xlabel('Max Tree Depth')
plt.xticks(range(0, 21, 2))
plt.ylabel('Accuracy Score')
plt.legend()
plt.show()

# Fit a final decision tree on all the data
tree = DecisionTreeClassifier(max_depth=best_depth_tree, random_state=0).fit(X_train_final, y_train)

y_pred_train_tree = tree.predict(X_train_final)
y_pred_test_tree = tree.predict(X_test_final)

tree_train_acc = accuracy_score(y_train, y_pred_train_tree)
tree_test_acc = accuracy_score(y_test, y_pred_test_tree)

# Plot the decision tree
plt.figure(figsize=(35, 10))
plot_tree(tree, filled=True, feature_names=X_train.columns, class_names=['0', '1'], rounded=True, fontsize=20)
plt.show()

# Tree Performance
print(f'Single Tree Train Accuracy: {tree_train_acc}')
print(f'Single Tree Test Accuracy: {tree_test_acc}')

Single Tree Train Accuracy: 0.7378309137489325
Single Tree Test Accuracy: 0.7053469852104665

# Max Depth CV - Bagging
oob_scores = []
depths = list(range(3,25))
for depth in depths:
    bagger = BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=depth), n_estimators=100, oob_score=True, random_state=209
                               ).fit(X_train_final, y_train)
    oob_scores.append(bagger.oob_score_)

best_idx = np.argmax(oob_scores)
best_depth = depths[best_idx]

best_depth

14

# Bagging Performance
n_trees = 200
bagger = BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=best_depth), n_estimators=n_trees, n_jobs=-1, random_state=209
                            ).fit(X_train_final, y_train)

y_train_pred_bagger = bagger.predict(X_train_final)
y_test_pred_bagger = bagger.predict(X_test_final)

bagger_train_acc = accuracy_score(y_train, y_train_pred_bagger)
bagger_test_acc = accuracy_score(y_test, y_test_pred_bagger)

bagging_predictors_1 = [bagged_tree.tree_.feature[0] for bagged_tree in bagger.estimators_]
bagging_counts_1 = pd.Series(bagging_predictors_1).value_counts()
bagging_predictors_2 = [bagged_tree.tree_.feature[1] for bagged_tree in bagger.estimators_]
bagging_counts_2 = pd.Series(bagging_predictors_2).value_counts()

top_predictors_bagging = pd.DataFrame(index=range(X_train_final.shape[1]))
top_predictors_bagging['predictor'] = X_train_final.columns[top_predictors_bagging.index]
top_predictors_bagging['first count'] = bagging_counts_1
# top_predictors_bagging['second count'] = bagging_counts_2
top_predictors_bagging = top_predictors_bagging.dropna(thresh=2)
top_predictors_bagging = top_predictors_bagging.reindex(top_predictors_bagging['first count'].sort_values(ascending=False).index)

plt.figure(figsize=(12, 8))
plt.barh(top_predictors_bagging['predictor'], top_predictors_bagging['first count'], color='skyblue')
plt.xlabel('Number of Times Used for the First Split')
plt.ylabel('Predictor')
plt.title('Top Node Predictors')
plt.gca().invert_yaxis()  # Invert y-axis for better readability
plt.show()

# Bagging Performance
print(f'Bagging Train Accuracy: {bagger_train_acc}')
print(f'Bagging Test Accuracy: {bagger_test_acc}')

Bagging Train Accuracy: 0.977227440933675
Bagging Test Accuracy: 0.7372013651877133

# Random Forest Classifier
train_acc_mean = []
val_acc_mcean = []
k = 5
depths = list(range(3,30))
# Cross Validation
for max_depth in depths:
    rf_scores = cross_validate(estimator=RandomForestClassifier(n_estimators=n_trees, max_depth=max_depth, max_features='sqrt', random_state=209),
                               X=X_train_final,
                               y=y_train,
                               scoring='accuracy',
                               cv=k,
                               n_jobs=-1,
                               return_train_score=True)
    train_acc_mean.append(np.mean(rf_scores['train_score']))
    val_acc_mcean.append(np.mean(rf_scores['test_score']))

# Get best depth for RF
best_idx = np.argmax(val_acc_mcean)
best_depth_rf = depths[best_idx]
print(f'The cross-validation results show that the best depth is {best_depth_rf}.')

The cross-validation results show that the best depth is 9.

# Plot the accuracies
plt.plot(depths, train_acc_mean)
plt.plot(depths, val_acc_mcean)
# Add labels
plt.xlabel(r'depth')
plt.ylabel('accuracy')
plt.axvline(best_depth_rf, c='k', ls='--', label=rf'best depth={best_depth_rf}')
plt.title(r'Random Forest CV Scores')
plt.xticks(range(3,31,3))
plt.legend();

n_trees = 200
tree_depth = best_depth_rf

# Fit a RF
random_forest = RandomForestClassifier(n_estimators=n_trees,
                                       max_depth=best_depth_rf,
                                       max_features='sqrt',
                                       random_state=0
                                       ).fit(X_train_final, y_train)

y_train_pred = random_forest.predict(X_train_final)
y_test_pred = random_forest.predict(X_test_final)

rf_train_acc = accuracy_score(y_train, y_train_pred)
rf_test_acc = accuracy_score(y_test, y_test_pred)

# RF Performance
print(f'RF Train Accuracy: {rf_train_acc}')
print(f'RF Test Accuracy: {rf_test_acc}')

RF Train Accuracy: 0.9373754625676061
RF Test Accuracy: 0.7485779294653014

# Random Forest Classifier (important predictors)
X_train_important = X_train_final[important_predictors]
X_test_important = X_test_final[important_predictors]

n_trees = 200
tree_depth = best_depth_rf

# Fit a RF
rf_important = RandomForestClassifier(n_estimators=n_trees,
                                       max_depth=tree_depth,
                                       max_features='sqrt',
                                       random_state=0
                                       ).fit(X_train_important, y_train)

y_train_important_pred = rf_important.predict(X_train_important)
y_test_important_pred = rf_important.predict(X_test_important)

rf_important_train_acc = accuracy_score(y_train, y_train_important_pred)
rf_important_test_acc = accuracy_score(y_test, y_test_important_pred)

# RF Performance, after lasso selection
print(f'RF Train Accuracy (full data): {rf_train_acc}')
print(f'RF Test Accuracy (full data): {rf_test_acc}')
print(f'RF Train Accuracy (lasso-selected predictors): {rf_important_train_acc}')
print(f'RF Test Accuracy (lasso-selected predictors): {rf_important_test_acc}')

RF Train Accuracy (full data): 0.9373754625676061
RF Test Accuracy (full data): 0.7485779294653014
RF Train Accuracy (lasso-selected predictors): 0.9154568744662681
RF Test Accuracy (lasso-selected predictors): 0.7281001137656428

# AdaBoost - (depth,iter,lr,acc): (2,150,.05) = .745, (1,200,.05)=.741, (2,200,.05)=.743, (2,200,0.3)=.744, (2,100,.075)=.743
ada_depth = 2
n_iters = 150
lr = 0.05
adaboost = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=ada_depth),
                              n_estimators=n_iters,
                              learning_rate=lr,
                              random_state=0
                             ).fit(X_train_final, y_train)

ada_train_acc = adaboost.score(X_train_final, y_train)
ada_test_acc = adaboost.score(X_test_final, y_test)

print(f'AdaBoost Train Accuracy: {ada_train_acc}')
print(f'AdaBoost Test Accuracy: {ada_test_acc}')

AdaBoost Train Accuracy: 0.7913464275547964
AdaBoost Test Accuracy: 0.7349260523321957

estimators = [('lasso', LogisticRegression(C=best_C, penalty='l1', solver='liblinear', random_state=209)),
              ('knn', KNeighborsClassifier(n_neighbors=best_k)),
              ('bagger', BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=13), n_estimators=200, n_jobs=-1, random_state=209)),
              ('rf', RandomForestClassifier(n_estimators=200, max_depth=7, random_state=209)),
              ('adaboost', AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=2), n_estimators=150, learning_rate=0.05, random_state=209))]

stacker = StackingClassifier(estimators=estimators,
                             final_estimator = LogisticRegression(),
                             cv=5,
                             n_jobs=-1)

stacker.fit(X_train_final, y_train)

StackingClassifier(cv=5,
                   estimators=[('lasso',
                                LogisticRegression(C=0.05, penalty='l1',
                                                   random_state=209,
                                                   solver='liblinear')),
                               ('knn', KNeighborsClassifier(n_neighbors=60)),
                               ('bagger',
                                BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=13),
                                                  n_estimators=200, n_jobs=-1,
                                                  random_state=209)),
                               ('rf',
                                RandomForestClassifier(max_depth=7,
                                                       n_estimators=200,
                                                       random_state=209)),
                               ('adaboost',
                                AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=2),
                                                   learning_rate=0.05,
                                                   n_estimators=150,
                                                   random_state=209))],
                   final_estimator=LogisticRegression(), n_jobs=-1)

StackingClassifier(cv=5,
                   estimators=[('lasso',
                                LogisticRegression(C=0.05, penalty='l1',
                                                   random_state=209,
                                                   solver='liblinear')),
                               ('knn', KNeighborsClassifier(n_neighbors=60)),
                               ('bagger',
                                BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=13),
                                                  n_estimators=200, n_jobs=-1,
                                                  random_state=209)),
                               ('rf',
                                RandomForestClassifier(max_depth=7,
                                                       n_estimators=200,
                                                       random_state=209)),
                               ('adaboost',
                                AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=2),
                                                   learning_rate=0.05,
                                                   n_estimators=150,
                                                   random_state=209))],
                   final_estimator=LogisticRegression(), n_jobs=-1)

LogisticRegression(C=0.05, penalty='l1', random_state=209, solver='liblinear')

KNeighborsClassifier(n_neighbors=60)

DecisionTreeClassifier(max_depth=13)

DecisionTreeClassifier(max_depth=13)

RandomForestClassifier(max_depth=7, n_estimators=200, random_state=209)

DecisionTreeClassifier(max_depth=2)

DecisionTreeClassifier(max_depth=2)

LogisticRegression()

stacker_train_acc = stacker.score(X_train_final, y_train)
stacker_test_acc = stacker.score(X_test_final, y_test)

# Stacker Performance
print(f'Stacker Train Accuracy: {stacker_train_acc}')
print(f'Stacker Test Accuracy: {stacker_test_acc}')

Stacker Train Accuracy: 0.9288357529177341
Stacker Test Accuracy: 0.7474402730375427

rf_importances = pd.DataFrame(data=[random_forest.feature_importances_], columns=X_train_std.columns, index=['RF']).T.sort_values(by='RF', ascending = True)
ada_importances = pd.DataFrame(data=[adaboost.feature_importances_], columns=X_train_std.columns, index=['Ada']).T.sort_values(by='Ada', ascending = True)

ada_importances[-30:].sum()

Ada    0.577874
dtype: float64

rf_importances[-30:].plot(kind='barh')
plt.title('Top 30 Predictors for Random Forest')
plt.xlabel('Importance')
plt.xlim(0, 0.05)
plt.show()

ada_importances[-30:].plot(kind='barh')
plt.title('Top 30 Predictors for AdaBoost')
plt.xlabel('Importance')
plt.xlim(0, 0.05)
plt.show()

classifiers = ['Naive Model',
               'L1 Logistic Regression',
               'kNN Classifier',
               'Single Decision Tree',
               'Bagging Model',
               'Random Forest',
               'AdaBoost',
               'Stacking Model']
train_scores = [naive_train_acc, lasso_train_acc, knn_train_acc, tree_train_acc, bagger_train_acc, rf_train_acc, ada_train_acc, stacker_train_acc]
train_scores = [np.round(score * 100, 2) for score in train_scores]
test_scores = [naive_test_acc, lasso_test_acc, knn_test_acc, tree_test_acc, bagger_test_acc, rf_test_acc, ada_test_acc, stacker_test_acc]
test_scores = [np.round(score * 100, 2) for score in test_scores]

# Create the df
results_df = pd.DataFrame({'Training Accuracy':train_scores, 'Testing Accuracy':test_scores}, index=classifiers)

results_df

	Revenue	Revenue Growth	Cost of Revenue	Gross Profit	R&D Expenses	SG&A Expense	Operating Expenses	Operating Income	Interest Expense	Earnings before Tax	...	Receivables growth	Inventory Growth	Asset Growth	Book Value per Share Growth	Debt Growth	R&D Expense Growth	SG&A Expenses Growth	Sector	2019 PRICE VAR [%]	Class
Ticker
CMCSA	9.450700e+10	0.1115	0.000000e+00	9.450700e+10	0.000000e+00	6.482200e+10	7.549800e+10	1.900900e+10	3.542000e+09	1.511100e+10	...	0.2570	0.0000	0.3426	0.0722	0.7309	0.0000	0.1308	Consumer Cyclical	32.794573	1
KMI	1.414400e+10	0.0320	7.288000e+09	6.856000e+09	0.000000e+00	6.010000e+08	3.062000e+09	3.794000e+09	1.917000e+09	2.196000e+09	...	0.0345	-0.0920	-0.0024	0.0076	-0.0137	0.0000	-0.1265	Energy	40.588068	1
INTC	7.084800e+10	0.1289	2.711100e+10	4.373700e+10	1.354300e+10	6.750000e+09	2.042100e+10	2.331600e+10	-1.260000e+08	2.331700e+10	...	0.1989	0.0387	0.0382	0.1014	-0.0169	0.0390	-0.0942	Technology	30.295514	1
MU	3.039100e+10	0.4955	1.250000e+10	1.789100e+10	2.141000e+09	8.130000e+08	2.897000e+09	1.499400e+10	3.420000e+08	1.430300e+10	...	0.4573	0.1511	0.2275	0.6395	-0.5841	0.1738	0.0942	Technology	64.213737	1
GE	1.216150e+11	0.0285	9.546100e+10	2.615400e+10	0.000000e+00	1.811100e+10	4.071100e+10	-1.455700e+10	5.059000e+09	-2.177200e+10	...	-0.2781	-0.2892	-0.1575	-0.4487	-0.2297	0.0000	0.0308	Industrials	44.757840	1

	Revenue	Revenue Growth	Cost of Revenue	Gross Profit	R&D Expenses	SG&A Expense	Operating Expenses	Operating Income	Interest Expense	Earnings before Tax	...	3Y Dividend per Share Growth (per Share)	Receivables growth	Inventory Growth	Asset Growth	Book Value per Share Growth	Debt Growth	R&D Expense Growth	SG&A Expenses Growth	2019 PRICE VAR [%]	Class
count	4.346000e+03	4253.000000	4.207000e+03	4.328000e+03	4.155000e+03	4.226000e+03	4.208000e+03	4.357000e+03	4.208000e+03	4.321000e+03	...	4067.000000	4268.000000	4160.000000	4178.000000	4121.000000	4128.000000	4133.000000	4144.000000	4392.000000	4392.000000
mean	5.119287e+09	3.455278	3.144946e+09	2.043954e+09	1.180176e+08	9.005022e+08	1.435546e+09	6.541207e+08	1.001350e+08	5.584432e+08	...	0.006081	36.768524	0.183066	1.389013	0.262530	9.928446	0.091891	0.153610	20.803948	0.693534
std	2.049504e+10	195.504906	1.508813e+10	7.682369e+09	9.330891e+08	3.661116e+09	5.529831e+09	2.969341e+09	3.780021e+08	2.639327e+09	...	0.239653	2347.079237	4.688013	35.123904	5.612666	363.717734	0.823281	0.839647	82.622147	0.461078
min	-6.894100e+07	-3.461500	-2.669055e+09	-1.818220e+09	-1.042000e+08	-1.401594e+08	-4.280000e+09	-1.455700e+10	-1.408252e+09	-2.177200e+10	...	-1.000000	-1.000000	-1.000000	-0.999100	-32.258100	-1.000000	-1.000000	-1.000000	-99.864779	0.000000
25%	6.501425e+07	0.000000	3.415500e+06	3.618903e+07	0.000000e+00	2.056226e+07	4.223644e+07	-5.510000e+06	0.000000e+00	-1.000800e+07	...	0.000000	-0.048075	0.000000	-0.036700	-0.108600	-0.082850	0.000000	-0.004650	-7.477173	0.000000
50%	4.982640e+08	0.074900	1.741180e+08	2.219470e+08	0.000000e+00	9.390450e+07	1.806253e+08	4.203800e+07	5.693500e+06	2.730700e+07	...	0.000000	0.010200	0.000000	0.034750	0.026100	0.000000	0.000000	0.065700	17.639393	1.000000
75%	2.457878e+09	0.188500	1.297814e+09	9.767015e+08	1.450150e+07	4.117162e+08	6.796040e+08	2.862690e+08	5.817075e+07	2.238810e+08	...	0.042050	0.185900	0.080050	0.160575	0.138400	0.115425	0.009700	0.167625	39.625879	1.000000
max	5.003430e+11	12739.000000	3.733960e+11	1.269470e+11	2.883700e+10	1.065100e+11	1.065100e+11	7.089800e+10	9.168000e+09	7.290300e+10	...	4.079100	153332.333300	293.473000	1184.993800	313.395800	17646.823500	36.898100	43.718800	3756.716345	1.000000

	Revenue	Revenue Growth	Cost of Revenue	Gross Profit	R&D Expenses	SG&A Expense	Operating Expenses	Operating Income	Interest Expense	Earnings before Tax	...	10Y Dividend per Share Growth (per Share) (YoY)	5Y Dividend per Share Growth (per Share) (YoY)	3Y Dividend per Share Growth (per Share) (YoY)	Receivables growth (YoY)	Inventory Growth (YoY)	Asset Growth (YoY)	Book Value per Share Growth (YoY)	Debt Growth (YoY)	R&D Expense Growth (YoY)	SG&A Expenses Growth (YoY)
Ticker
CMCSA	9.450700e+10	0.1115	0.000000e+00	9.450700e+10	0.000000e+00	6.482200e+10	7.549800e+10	1.900900e+10	3.542000e+09	1.511100e+10	...	inf	1.394095	0.981435	1.325792	NaN	7.875648	-0.761243	11.711304	NaN	1.505747
KMI	1.414400e+10	0.0320	7.288000e+09	6.856000e+09	0.000000e+00	6.010000e+08	3.062000e+09	3.794000e+09	1.917000e+09	2.196000e+09	...	NaN	0.128674	-0.168657	-0.393673	-1.490144	-0.846154	-1.329004	-0.751361	NaN	4.938967
INTC	7.084800e+10	0.1289	2.711100e+10	4.373700e+10	1.354300e+10	6.750000e+09	2.042100e+10	2.331600e+10	-1.260000e+08	2.331700e+10	...	-0.107338	0.351598	0.245161	0.017391	-0.849709	-0.563927	1.086420	-1.279339	0.413043	-0.146739
MU	3.039100e+10	0.4955	1.250000e+10	1.789100e+10	2.141000e+09	8.130000e+08	2.897000e+09	1.499400e+10	3.420000e+08	1.430300e+10	...	NaN	NaN	NaN	-0.440748	0.865432	-0.196397	0.370847	-5.650478	0.357813	-0.261176
GE	1.216150e+11	0.0285	9.546100e+10	2.615400e+10	0.000000e+00	1.811100e+10	4.071100e+10	-1.455700e+10	5.059000e+09	-2.177200e+10	...	2.686084	-4.795148	12.712042	7.301493	1.202589	-15.189189	0.821762	17.230159	NaN	-3.933333

	Training Accuracy	Testing Accuracy
Naive Model	69.54	68.60
L1 Logistic Regression	73.50	70.76
kNN Classifier	71.22	69.97
Single Decision Tree	73.78	70.53
Bagging Model	97.72	73.72
Random Forest	93.74	74.86
AdaBoost	79.13	73.49
Stacking Model	92.88	74.74

Data Science 1: Introduction to Data Science

Final Project: Predicting Stock Prices from Financial Data¶

Import Libraries¶

Notebook Contents¶

1. Introduction¶

2. Data Description¶

Load in the data¶

3. EDA and Visualizations¶

Visualizations¶

4. Data Preprocessing¶

Add the YoY change for each variable¶

Preliminary Feature Selection¶

Remove Duplicates¶

Remove columns with at least $k$ missing values¶

Train-test Split¶

One-hot Encoding¶

Standardization¶

Imputation¶

5. Models¶

Naive Model¶

Logistic Regression w/ L1 Penalty Term¶

kNN Classification¶

Single Decision Tree¶

Bagging¶

Random Forest on All Predictors¶

Random Forest on Important Predictors¶

AdaBoost¶

Stacking (209)¶

6. Feature Importance¶

7. Results & Discussion¶

8. Future Work¶