In this project we will try to dissect Spotify's valence metric, untangle the mystery behind it and propose how it is derived. This assignment was done as a project for the Practical Data Science course taught by Mr. Panos Louridas at the Athens University of Economics and Business.
Spotify uses a metric called valence to measure the happiness of a track. The metric itself, however, was not developed by Spotify. It was originally developed by The Echo Nest, a company that was bought by Spotify in 2014. We don't know exactly how valence is calculated, but some details are given in a blog post, which you can find here:
Our task is to untangle the mystery behind valence and to propose how it is derived.
Spotify offers the following information that may be relevant to our task:
We start by creating a Spotify account, if we don't already have one, and logging into our dashboard.
We click on CREATE AN APP and choose a name and a description for our app.
In the dashboard of our application we can see the Client ID and Client Secret that we will use to access the Spotify API. We should be careful when requesting data from the API, as there are rate limits in place; if these limits are exceeded we will get 429 error responses from Spotify's Web API.
Now that we have our credentials, we create a spotify_config.py file to store them and we are ready to connect to the service.
The credentials stored in the spotify_config.py file are not encrypted, so anyone with the file can access them. If we wanted to make this code public, say in a GitHub repo, we would add spotify_config.py to the .gitignore file.
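For reference, here is a minimal sketch of what spotify_config.py could look like; the from spotify_config import config import below expects a config dictionary, and the values here are of course placeholders:

# spotify_config.py -- keep this file out of version control
config = {
    'client_id': 'YOUR_CLIENT_ID',          # from the app dashboard
    'client_secret': 'YOUR_CLIENT_SECRET',
}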
# Let's start with our imports
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm
import re
from datetime import datetime
import os
import glob
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotify_config import config
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
Here we are creating a spotify client session so we can connect to the API.
client_credentials_manager = SpotifyClientCredentials(config['client_id'],
config['client_secret'])
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
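Since the API rate-limits us (the 429 responses mentioned earlier), a small defensive wrapper like the sketch below can wait and retry. This is our own addition, and call_with_retry is a hypothetical helper; recent spotipy versions can also retry for you via the retries argument of spotipy.Spotify.

import time
from spotipy.exceptions import SpotifyException

def call_with_retry(func, *args, max_retries=5, **kwargs):
    # retry a spotipy call when Spotify answers with HTTP 429 (rate limited)
    for attempt in range(max_retries):
        try:
            return func(*args, **kwargs)
        except SpotifyException as exc:
            if exc.http_status != 429:
                raise
            # honour the Retry-After header when present, otherwise back off exponentially
            headers = getattr(exc, 'headers', None) or {}
            time.sleep(int(headers.get('Retry-After', 2 ** attempt)))
    raise RuntimeError(f'gave up after {max_retries} retries')

For example, we could wrap the batched calls further down as call_with_retry(sp.audio_features, tracks_batch).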
Next we need to get some song ids that we will use to request the data from the Spotify server. For that purpose we downloaded chart data for some countries, and for the world as a whole, for the period 2017-2019, from here; the data were collected for the needs of the following paper:
Gabriel P. Oliveira, Mariana O. Silva, Danilo B. Seufitelli, Anisio Lacerda, and Mirella M. Moro. Detecting Collaboration Profiles in Success-based Music Genre Networks. In Proceedings of the 21st International Society for Music Information Retrieval Conference (ISMIR 2020), 2020.
From the Zenodo page that we opened, we download charts.zip and extract its contents into our working directory.
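If we wanted to do this step programmatically too, a hedged sketch follows; the record id in the URL is a placeholder, the actual link is the one on the Zenodo page above:

import io
import zipfile
import requests

# placeholder URL: substitute the real record link from the Zenodo page
url = 'https://zenodo.org/record/<RECORD_ID>/files/charts.zip'
archive = requests.get(url)
zipfile.ZipFile(io.BytesIO(archive.content)).extractall('.')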
Now let's run the code below to get the songs into a dataframe.
header = 0
dfs = []
for file in glob.glob('Charts/*/201?/*.csv'):
    # the second path component is the region; the backslash split assumes Windows paths
    region = file.split('\\')[1]
    # the file name contains the start and end dates of the weekly chart
    dates = re.findall(r'\d{4}-\d{2}-\d{2}', file.split('/')[-1])
    weekly_chart = pd.read_csv(file, header=header, sep='\t')
    weekly_chart['week_start'] = datetime.strptime(dates[0], '%Y-%m-%d')
    weekly_chart['week_end'] = datetime.strptime(dates[1], '%Y-%m-%d')
    weekly_chart['region'] = region
    dfs.append(weekly_chart)
all_charts = pd.concat(dfs)
all_charts
position | song_id | song_name | artist | streams | last_week_position | weeks_on_chart | peak_position | position_status | week_start | week_end | region | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 5aAx2yezTd8zXrkmtKl66Z | Starboy | The Weeknd | 25734078 | NaN | 1 | 1 | new | 2016-12-30 | 2017-01-06 | global |
1 | 2 | 7BKLCZ1jbUBVqRi2FVlTVw | Closer | The Chainsmokers | 23519705 | NaN | 1 | 2 | new | 2016-12-30 | 2017-01-06 | global |
2 | 3 | 5knuzwU65gJK7IF5yJsuaW | Rockabye (feat. Sean Paul & Anne-Marie) | Clean Bandit | 21216399 | NaN | 1 | 3 | new | 2016-12-30 | 2017-01-06 | global |
3 | 4 | 4pdPtRcBmOSQDlJ3Fk945m | Let Me Love You | DJ Snake | 19852704 | NaN | 1 | 4 | new | 2016-12-30 | 2017-01-06 | global |
4 | 5 | 3NdDpSvN911VPGivFlV5d0 | I Don’t Wanna Live Forever (Fifty Shades Darke... | ZAYN | 18316326 | NaN | 1 | 5 | new | 2016-12-30 | 2017-01-06 | global |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
195 | 196 | 2nMZx7QHerfo4Wv37xNUEC | Wrapped in Red | Kelly Clarkson | 291064 | NaN | 1 | 196 | new | 2019-12-20 | 2019-12-27 | ca |
196 | 197 | 0e7ipj03S05BNilyu5bRzt | rockstar (feat. 21 Savage) | Post Malone | 290360 | 158.0 | 87 | 7 | -39 | 2019-12-20 | 2019-12-27 | ca |
197 | 198 | 0u2P5u6lvoDfwTYjAADbn4 | lovely (with Khalid) | Billie Eilish | 289022 | 170.0 | 66 | 29 | -28 | 2019-12-20 | 2019-12-27 | ca |
198 | 199 | 30SNjazZhzunhAWCjhdyyD | Christmas All Over Again | Tom Petty and the Heartbreakers | 288611 | NaN | 1 | 199 | new | 2019-12-20 | 2019-12-27 | ca |
199 | 200 | 7FGq80cy8juXBCD2nrqdWU | Eastside (with Halsey & Khalid) | benny blanco | 288079 | 136.0 | 55 | 11 | -64 | 2019-12-20 | 2019-12-27 | ca |
273600 rows × 12 columns
There are 273600 rows in the dataset, but we care about the unique songs. Counting them by id, we get 13880 unique songs.
all_charts['song_id'].nunique()
13880
The audio_features method of the Spotipy library takes as an argument a list of at most 100 ids, since the Spotify API has a limit of 100 songs per request, as seen in the screenshots above. Below is an example of a call to this method for 2 songs.
sp.audio_features(['3js3wKPw8VxBWtcXtwyUnA', '4VVG3HBGaqSNZqIpmewIA6'])
[{'danceability': 0.798, 'energy': 0.627, 'key': 9, 'loudness': -6.234, 'mode': 1, 'speechiness': 0.111, 'acousticness': 0.263, 'instrumentalness': 0, 'liveness': 0.211, 'valence': 0.762, 'tempo': 149.989, 'type': 'audio_features', 'id': '3js3wKPw8VxBWtcXtwyUnA', 'uri': 'spotify:track:3js3wKPw8VxBWtcXtwyUnA', 'track_href': 'https://api.spotify.com/v1/tracks/3js3wKPw8VxBWtcXtwyUnA', 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/3js3wKPw8VxBWtcXtwyUnA', 'duration_ms': 195947, 'time_signature': 4}, {'danceability': 0.777, 'energy': 0.721, 'key': 6, 'loudness': -6.097, 'mode': 1, 'speechiness': 0.0719, 'acousticness': 0.0774, 'instrumentalness': 0, 'liveness': 0.0801, 'valence': 0.665, 'tempo': 161.976, 'type': 'audio_features', 'id': '4VVG3HBGaqSNZqIpmewIA6', 'uri': 'spotify:track:4VVG3HBGaqSNZqIpmewIA6', 'track_href': 'https://api.spotify.com/v1/tracks/4VVG3HBGaqSNZqIpmewIA6', 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/4VVG3HBGaqSNZqIpmewIA6', 'duration_ms': 205013, 'time_signature': 4}]
An explanation of the audio features can be found in Spotify's Web API Reference. The features that may be useful to us are listed below:
In order not to download the data every time we run this notebook (doing the same calls over and over), we will save the resulting dataframe as a csv later on. Here, we check if the csv exists and, if it does not, we download the data.
So, now we need to create a dictionary where the keys will be the song ids and the values will be the song features. Then, we download the song features in batches of 100.
if os.path.exists('valence.csv'):
    df = pd.read_csv('valence.csv')
else:
    features = {}
    all_track_ids = list(all_charts['song_id'].unique())
    start = 0
    num_tracks = 100
    while start < len(all_track_ids):
        print(f'getting from {start} to {start+num_tracks}')
        tracks_batch = all_track_ids[start:start+num_tracks]
        features_batch = sp.audio_features(tracks_batch)
        features.update({ track_id : track_features
                          for track_id, track_features in zip(tracks_batch, features_batch) })
        start += num_tracks
    # Let's confirm that we indeed got 13880 songs and turn the dictionary into a dataframe
    # that we will use for our analysis.
    print(len(features))
    df = pd.DataFrame.from_dict(features, orient='index')
    # index=False drops the track ids, which we don't need for the analysis
    df.to_csv('valence.csv', index=False)
df
Get Track's Audio Features and Get Tracks' Audio Features return the audio features of one track or of multiple tracks respectively, so we are fine using the latter. Now, let's see if we need to use Get Track's Audio Analysis as well. This API call takes exactly one track id, so in order to collect data for the 13880 tracks we would need 13880 calls. So, let's first examine what information it can give us and see if it is feasible to collect this data.
sp.audio_analysis('3js3wKPw8VxBWtcXtwyUnA')
{'meta': {'analyzer_version': '4.0.0', 'platform': 'Linux', 'detailed_status': 'OK', 'status_code': 0, 'timestamp': 1573781674, 'analysis_time': 86.85016, 'input_process': 'libvorbisfile L+R 44100->22050'}, 'track': {'num_samples': 4320624, 'duration': 195.94667, 'sample_md5': '', 'offset_seconds': 0, 'window_seconds': 0, 'analysis_sample_rate': 22050, 'analysis_channels': 1, 'end_of_fade_in': 0.5049, 'start_of_fade_out': 191.48917, 'loudness': -6.234, 'tempo': 149.989, 'tempo_confidence': 0.491, 'time_signature': 4, 'time_signature_confidence': 1.0, 'key': 9, 'key_confidence': 0.404, 'mode': 1, 'mode_confidence': 0.364, 'codestring': '...', 'code_version': 3.15, 'echoprintstring': '...', 'echoprint_version': 4.12, 'synchstring': '...', 'synch_version': 1.0, 'rhythmstring': '...', 'rhythm_version': 1.0}, 'bars': [{'start': 0.53103, 'duration': 1.60417, 'confidence': 0.649}, {'start': 2.13519, 'duration': 1.60617, 'confidence': 0.453}, {'start': 3.74136, 'duration': 1.59455, 'confidence': 0.596}, {'start': 5.33591, 'duration': 1.59633, 'confidence': 0.123}, {'start': 6.93224, 'duration': 1.60467, 'confidence': 0.826}, {'start': 8.53691, 'duration': 1.59979, 'confidence': 0.263}, {'start': 10.1367, 'duration': 1.60055, 'confidence': 0.751}, {'start': 11.73725, 'duration': 0.39955, 'confidence': 0.484}, {'start': 12.1368, 'duration': 1.59157, 'confidence': 0.381}, ...], 'sections': [{'start': 0.0, 'duration': 12.1368, 'confidence': 1.0, 'loudness': -13.025, 'tempo': 149.938, 'tempo_confidence': 0.75, 'key': 4, 'key_confidence': 0.035, 'mode': 1, 'mode_confidence': 0.567, 'time_signature': 4, 'time_signature_confidence': 1.0}, {'start': 12.1368, 'duration': 13.18691, 'confidence': 1.0, 'loudness': -6.265, 'tempo': 149.901, 'tempo_confidence': 0.645, 'key': 4, 'key_confidence': 0.256, 'mode': 1, 'mode_confidence': 0.344, 'time_signature': 4, 'time_signature_confidence': 1.0}, {'start': 25.32371, 'duration': 22.00516, 'confidence': 0.045, 'loudness': -6.106, 'tempo': 150.02, 'tempo_confidence': 0.671, 'key': 9, 'key_confidence': 0.724, 'mode': 1, 'mode_confidence': 0.462, 'time_signature': 4, 'time_signature_confidence': 1.0}, {'start': 47.32887, 'duration': 32.39803, 'confidence': 0.577, 'loudness': -6.173, 'tempo': 150.023, 'tempo_confidence': 0.685, 'key': 4, 'key_confidence': 0.386, 'mode': 1, 'mode_confidence': 0.424, 'time_signature': 4, 'time_signature_confidence': 1.0}, {'start': 79.72689, 'duration': 23.58297, 'confidence': 0.009, 'loudness': -5.155, 'tempo': 150.1, 'tempo_confidence': 0.59, 'key': 9, 'key_confidence': 0.0, 'mode': 1, 'mode_confidence': 0.057, 'time_signature': 4, 'time_signature_confidence': 1.0}, {'start': 103.30987, 'duration': 13.21838, 'confidence': 0.251, 'loudness': -9.083, 'tempo': 149.803, 'tempo_confidence': 0.402, 'key': 4, 'key_confidence': 0.558, 'mode': 1, 'mode_confidence': 0.516, 'time_signature': 4, 'time_signature_confidence': 1.0}, {'start': 116.52825, 'duration': 12.79928, 'confidence': 0.544, 'loudness': -10.802, 'tempo': 150.473, 'tempo_confidence': 0.575, 'key': 4, 'key_confidence': 0.341, 'mode': 1, 'mode_confidence': 0.593, 'time_signature': 4, 'time_signature_confidence': 1.0}, ...]
%%timeit -o
a1 = sp.audio_analysis('3js3wKPw8VxBWtcXtwyUnA')
198 ms ± 8.33 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
<TimeitResult : 198 ms ± 8.33 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)>
result = _
print(f'We would need around {result.average * 13880 / 3600} hours to download the audio_analysis for every song needed in our dataset as the audio analysis breaks the songs in many pieces and produces various metrics about their beat, tempo, bars, sections, tatums, segments, etc. Also, we would need field specific knowledge to encode most of this data. For this reason, at least for now, we will skip collecting data from the audio_analysis function and focus on the data from the audio features. If later, we are not satisfied with the results of our models we could come back and add the audio analysis data into our dataset.')
We would need around 0.7634772212699664 hours to download the audio_analysis for every song needed in our dataset as the audio analysis breaks the songs in many pieces and produces various metrics about their beat, tempo, bars, sections, tatums, segments, etc. Also, we would need field specific knowledge to encode most of this data. For this reason, at least for now, we will skip collecting data from the audio_analysis function and focus on the data from the audio features. If later, we are not satisfied with the results of our models we could come back and add the audio analysis data into our dataset.
df
danceability | energy | key | loudness | mode | speechiness | acousticness | instrumentalness | liveness | tempo | duration_ms | time_signature | valence | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.681 | 0.594 | 7 | -7.028 | 1 | 0.2820 | 0.1650 | 0.000003 | 0.134 | 186.054 | 230453 | 4 | 0.5350 |
1 | 0.748 | 0.524 | 8 | -5.599 | 1 | 0.0338 | 0.4140 | 0.000000 | 0.111 | 95.010 | 244960 | 4 | 0.6610 |
2 | 0.720 | 0.763 | 9 | -4.068 | 0 | 0.0523 | 0.4060 | 0.000000 | 0.180 | 101.965 | 251088 | 4 | 0.7420 |
3 | 0.476 | 0.718 | 8 | -5.309 | 1 | 0.0576 | 0.0784 | 0.000010 | 0.122 | 199.864 | 205947 | 4 | 0.1420 |
4 | 0.735 | 0.451 | 0 | -8.374 | 1 | 0.0585 | 0.0631 | 0.000013 | 0.325 | 117.973 | 245200 | 4 | 0.0862 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
13875 | 0.718 | 0.882 | 4 | -4.677 | 0 | 0.0521 | 0.0229 | 0.028600 | 0.102 | 95.005 | 194171 | 4 | 0.6110 |
13876 | 0.509 | 0.307 | 1 | -9.616 | 0 | 0.0307 | 0.8740 | 0.000000 | 0.701 | 85.471 | 143467 | 4 | 0.3920 |
13877 | 0.533 | 0.436 | 9 | -9.542 | 0 | 0.0289 | 0.4500 | 0.000000 | 0.102 | 103.477 | 215160 | 3 | 0.1690 |
13878 | 0.544 | 0.593 | 5 | -6.313 | 0 | 0.0728 | 0.4990 | 0.000000 | 0.159 | 135.245 | 218920 | 3 | 0.4940 |
13879 | 0.540 | 0.734 | 7 | -4.668 | 1 | 0.0366 | 0.3650 | 0.000000 | 0.132 | 123.877 | 216907 | 4 | 0.3600 |
13880 rows × 13 columns
We select the columns that have some meaning for our analysis; of course, track ids, song uris, etc. are of no use to us.
df = df[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'tempo', 'duration_ms', 'time_signature', 'valence']]
df
danceability | energy | key | loudness | mode | speechiness | acousticness | instrumentalness | liveness | tempo | duration_ms | time_signature | valence | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.681 | 0.594 | 7 | -7.028 | 1 | 0.2820 | 0.1650 | 0.000003 | 0.134 | 186.054 | 230453 | 4 | 0.5350 |
1 | 0.748 | 0.524 | 8 | -5.599 | 1 | 0.0338 | 0.4140 | 0.000000 | 0.111 | 95.010 | 244960 | 4 | 0.6610 |
2 | 0.720 | 0.763 | 9 | -4.068 | 0 | 0.0523 | 0.4060 | 0.000000 | 0.180 | 101.965 | 251088 | 4 | 0.7420 |
3 | 0.476 | 0.718 | 8 | -5.309 | 1 | 0.0576 | 0.0784 | 0.000010 | 0.122 | 199.864 | 205947 | 4 | 0.1420 |
4 | 0.735 | 0.451 | 0 | -8.374 | 1 | 0.0585 | 0.0631 | 0.000013 | 0.325 | 117.973 | 245200 | 4 | 0.0862 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
13875 | 0.718 | 0.882 | 4 | -4.677 | 0 | 0.0521 | 0.0229 | 0.028600 | 0.102 | 95.005 | 194171 | 4 | 0.6110 |
13876 | 0.509 | 0.307 | 1 | -9.616 | 0 | 0.0307 | 0.8740 | 0.000000 | 0.701 | 85.471 | 143467 | 4 | 0.3920 |
13877 | 0.533 | 0.436 | 9 | -9.542 | 0 | 0.0289 | 0.4500 | 0.000000 | 0.102 | 103.477 | 215160 | 3 | 0.1690 |
13878 | 0.544 | 0.593 | 5 | -6.313 | 0 | 0.0728 | 0.4990 | 0.000000 | 0.159 | 135.245 | 218920 | 3 | 0.4940 |
13879 | 0.540 | 0.734 | 7 | -4.668 | 1 | 0.0366 | 0.3650 | 0.000000 | 0.132 | 123.877 | 216907 | 4 | 0.3600 |
13880 rows × 13 columns
Below, using the code from the intro_to_stats_using_python notebook, we will use inferential statistics to find the best combination of variables to use. The adjusted $R^2$ will be our metric of choice: $$ \mathrm{Adjusted}\ R^2 = 1 - \frac{SS_\text{res}/(n - d - 1)}{SS_\text{tot}/(n-1)} $$
To understand why we use this metric we can just take a look at the definitions for $R^2$ and Adjusted $R^2$.
$R^2$ – It tells us about the goodness of the fit. It ranges between 0 and 1. The closer the value is to 1, the better. It expresses the extent of variation of the dependent variable explained by the model. However, it is biased in the sense that it never decreases (even when adding variables).
Adjusted $R^2$ – This parameter has a penalising factor (the number of regressors) and it decreases or stays identical as unnecessary independent variables are added. If its value keeps increasing as you add or remove parameters, go ahead with the model; otherwise stop and revert.
(Source: https://www.kaggle.com/saikrishna20/1-3-multiple-linear-regression-backward-eliminat)
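For intuition, the formula above can be rewritten in terms of the plain $R^2$, since $R^2 = 1 - SS_\text{res}/SS_\text{tot}$: $\mathrm{Adjusted}\ R^2 = 1 - (1 - R^2)\frac{n-1}{n-d-1}$. A quick sketch of this, plugging in numbers taken from the model summaries further down:

def adjusted_r2(r2, n, d):
    # Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - d - 1)
    return 1 - (1 - r2) * (n - 1) / (n - d - 1)

adjusted_r2(0.236, 13880, 11)  # ~0.235, in line with the 0.236 reported below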
def process_subset(y, data, feature_set):
    X = data.loc[:, feature_set].values
    X = sm.add_constant(X)
    names = ['intercept']
    names.extend(feature_set)
    model = sm.OLS(y, X)
    model.data.xnames = names
    regr = model.fit()
    return regr
import itertools

def get_best_of_k(y, data, k):
    # for a fixed number of variables k, the subset that maximises R^2
    # also maximises the adjusted R^2, so plain R^2 is enough here
    best_rsquared = 0
    best_model = None
    for comb in itertools.combinations(data.columns, k):
        regr = process_subset(y, data, comb)
        if regr.rsquared > best_rsquared:
            best_rsquared = regr.rsquared
            best_model = regr
    return best_model
def best_subset_selection(data, exog):
    # note: 'exog' here names the dependent variable
    # (what statsmodels would actually call endog)
    best_model = None
    best_models = []
    y = data.loc[:, exog]
    endog = [ x for x in data.columns if x != exog ]
    X = data.loc[:, endog]
    for i in range(1, len(data.columns)):
        print(f'Finding the best model for {i} variable{"s" if i > 1 else ""}')
        model = get_best_of_k(y, X, i)
        if not best_model or model.rsquared_adj > best_model.rsquared_adj:
            best_model = model
        print(model.model.data.xnames[1:])  # the selected variables, minus the intercept
        best_models.append(model)
    print(f'Fitted {2**len(data.columns)} models')  # this count includes the target column, so it overstates the true number of fits
    return best_model, best_models
best_model, _ = best_subset_selection(df, 'valence')
print('Best overall model:', len(best_model.model.exog_names), best_model.model.exog_names)
Finding the best model for 1 variable
['energy']
Finding the best model for 2 variables
['danceability', 'energy']
Finding the best model for 3 variables
['danceability', 'energy', 'acousticness']
Finding the best model for 4 variables
['danceability', 'energy', 'acousticness', 'duration_ms']
Finding the best model for 5 variables
['danceability', 'energy', 'speechiness', 'acousticness', 'duration_ms']
Finding the best model for 6 variables
['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'duration_ms']
Finding the best model for 7 variables
['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'duration_ms']
Finding the best model for 8 variables
['danceability', 'energy', 'key', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'duration_ms']
Finding the best model for 9 variables
['danceability', 'energy', 'key', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'tempo', 'duration_ms']
Finding the best model for 10 variables
['danceability', 'energy', 'key', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'tempo', 'duration_ms']
Finding the best model for 11 variables
['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'tempo', 'duration_ms']
Finding the best model for 12 variables
['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'tempo', 'duration_ms', 'time_signature']
Fitted 8192 models
Best overall model: 12 ['intercept', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'tempo', 'duration_ms']
best_model.summary()
Dep. Variable: | valence | R-squared: | 0.236 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.236 |
Method: | Least Squares | F-statistic: | 389.9 |
Date: | Sun, 09 Jan 2022 | Prob (F-statistic): | 0.00 |
Time: | 17:19:25 | Log-Likelihood: | 3542.1 |
No. Observations: | 13880 | AIC: | -7060. |
Df Residuals: | 13868 | BIC: | -6970. |
Df Model: | 11 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
intercept | -0.2024 | 0.022 | -9.350 | 0.000 | -0.245 | -0.160 |
danceability | 0.3999 | 0.012 | 33.411 | 0.000 | 0.376 | 0.423 |
energy | 0.6026 | 0.015 | 39.456 | 0.000 | 0.573 | 0.633 |
key | 0.0022 | 0.000 | 4.845 | 0.000 | 0.001 | 0.003 |
loudness | -0.0024 | 0.001 | -2.343 | 0.019 | -0.004 | -0.000 |
mode | 0.0054 | 0.003 | 1.633 | 0.103 | -0.001 | 0.012 |
speechiness | -0.1172 | 0.013 | -9.302 | 0.000 | -0.142 | -0.092 |
acousticness | 0.1733 | 0.008 | 22.462 | 0.000 | 0.158 | 0.188 |
instrumentalness | -0.1588 | 0.027 | -5.935 | 0.000 | -0.211 | -0.106 |
liveness | 0.0487 | 0.011 | 4.540 | 0.000 | 0.028 | 0.070 |
tempo | 0.0002 | 5.72e-05 | 3.994 | 0.000 | 0.000 | 0.000 |
duration_ms | -2.897e-07 | 3.27e-08 | -8.862 | 0.000 | -3.54e-07 | -2.26e-07 |
Omnibus: | 140.840 | Durbin-Watson: | 1.834 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 89.787 |
Skew: | 0.014 | Prob(JB): | 3.18e-20 |
Kurtosis: | 2.607 | Cond. No. | 3.63e+06 |
The best model has 12 variables (including the intercept) as shown above with their coefficients, standard errors, p-values, confidence intervals, etc.
Next, we try both forward stepwise selection and backward stepwise selection. These procedures are well suited to cases where the number of independent variables is large: forward selection starts with no variables and adds one at a time, while backward selection starts with all of them and removes one at a time, as they search for the best model. It is important to understand that they are not guaranteed to find the best model overall. They can save us a lot of time though: instead of fitting 8192 models, our counters report that they fitted only 92.
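As a rough check on these counts (this is our own back-of-the-envelope sketch; the counters printed by our selection functions include the target column in the column count, so they run a little high):

from math import comb

p = 12  # candidate predictors: every feature except valence
print(sum(comb(p, k) for k in range(1, p + 1)))  # 4095 subsets for exhaustive best-subset search
print(sum(range(1, p + 1)))                      # 78 fits for one full stepwise pass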
def forward_add_variable(data, exog, selected, to_select):
    best_rsquared = 0
    best_model = None
    best_column = None
    y = data.loc[:, exog]
    for column in to_select:
        new_selected = selected + [column]
        regr = process_subset(y, data, new_selected)
        if regr.rsquared > best_rsquared:
            best_rsquared = regr.rsquared
            best_model = regr
            best_column = column
    return best_model, best_column
def forward_stepwise_selection(data, exog):
    best_models = []
    best_model = None
    selected = []
    to_select = [ x for x in data.columns if x != exog ]
    p = len(to_select) + 1
    for i in range(1, p):
        print(f'Finding the best model for {i} variable{"s" if i > 1 else ""}')
        model, best_column = forward_add_variable(data, exog, selected, to_select)
        selected.append(best_column)
        to_select.remove(best_column)
        if not best_model or model.rsquared_adj > best_model.rsquared_adj:
            best_model = model
        print(selected)
        best_models.append(model)
    print(f'Fitted {1 + p*(p+1)//2} models')
    return best_model, best_models
best_model, _ = forward_stepwise_selection(df, 'valence')
print('Best overall model:', len(best_model.model.exog_names), best_model.model.exog_names)
Finding the best model for 1 variable
['energy']
Finding the best model for 2 variables
['energy', 'danceability']
Finding the best model for 3 variables
['energy', 'danceability', 'acousticness']
Finding the best model for 4 variables
['energy', 'danceability', 'acousticness', 'duration_ms']
Finding the best model for 5 variables
['energy', 'danceability', 'acousticness', 'duration_ms', 'speechiness']
Finding the best model for 6 variables
['energy', 'danceability', 'acousticness', 'duration_ms', 'speechiness', 'instrumentalness']
Finding the best model for 7 variables
['energy', 'danceability', 'acousticness', 'duration_ms', 'speechiness', 'instrumentalness', 'liveness']
Finding the best model for 8 variables
['energy', 'danceability', 'acousticness', 'duration_ms', 'speechiness', 'instrumentalness', 'liveness', 'key']
Finding the best model for 9 variables
['energy', 'danceability', 'acousticness', 'duration_ms', 'speechiness', 'instrumentalness', 'liveness', 'key', 'tempo']
Finding the best model for 10 variables
['energy', 'danceability', 'acousticness', 'duration_ms', 'speechiness', 'instrumentalness', 'liveness', 'key', 'tempo', 'loudness']
Finding the best model for 11 variables
['energy', 'danceability', 'acousticness', 'duration_ms', 'speechiness', 'instrumentalness', 'liveness', 'key', 'tempo', 'loudness', 'mode']
Finding the best model for 12 variables
['energy', 'danceability', 'acousticness', 'duration_ms', 'speechiness', 'instrumentalness', 'liveness', 'key', 'tempo', 'loudness', 'mode', 'time_signature']
Fitted 92 models
Best overall model: 12 ['intercept', 'energy', 'danceability', 'acousticness', 'duration_ms', 'speechiness', 'instrumentalness', 'liveness', 'key', 'tempo', 'loudness', 'mode']
best_model.summary()
Dep. Variable: | valence | R-squared: | 0.236 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.236 |
Method: | Least Squares | F-statistic: | 389.9 |
Date: | Sun, 09 Jan 2022 | Prob (F-statistic): | 0.00 |
Time: | 17:19:26 | Log-Likelihood: | 3542.1 |
No. Observations: | 13880 | AIC: | -7060. |
Df Residuals: | 13868 | BIC: | -6970. |
Df Model: | 11 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
intercept | -0.2024 | 0.022 | -9.350 | 0.000 | -0.245 | -0.160 |
energy | 0.6026 | 0.015 | 39.456 | 0.000 | 0.573 | 0.633 |
danceability | 0.3999 | 0.012 | 33.411 | 0.000 | 0.376 | 0.423 |
acousticness | 0.1733 | 0.008 | 22.462 | 0.000 | 0.158 | 0.188 |
duration_ms | -2.897e-07 | 3.27e-08 | -8.862 | 0.000 | -3.54e-07 | -2.26e-07 |
speechiness | -0.1172 | 0.013 | -9.302 | 0.000 | -0.142 | -0.092 |
instrumentalness | -0.1588 | 0.027 | -5.935 | 0.000 | -0.211 | -0.106 |
liveness | 0.0487 | 0.011 | 4.540 | 0.000 | 0.028 | 0.070 |
key | 0.0022 | 0.000 | 4.845 | 0.000 | 0.001 | 0.003 |
tempo | 0.0002 | 5.72e-05 | 3.994 | 0.000 | 0.000 | 0.000 |
loudness | -0.0024 | 0.001 | -2.343 | 0.019 | -0.004 | -0.000 |
mode | 0.0054 | 0.003 | 1.633 | 0.103 | -0.001 | 0.012 |
Omnibus: | 140.840 | Durbin-Watson: | 1.834 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 89.787 |
Skew: | 0.014 | Prob(JB): | 3.18e-20 |
Kurtosis: | 2.607 | Cond. No. | 3.63e+06 |
def backward_remove_variable(data, exog, selected):
    best_rsquared = 0
    best_model = None
    best_column = None
    y = data.loc[:, exog]
    for column in selected:
        new_selected = selected[:]
        new_selected.remove(column)
        regr = process_subset(y, data, new_selected)
        if regr.rsquared > best_rsquared:
            best_rsquared = regr.rsquared
            best_model = regr
            best_column = column
    return best_model, best_column
def backward_stepwise_selection(data, exog):
    best_models = []
    selected = [ x for x in data.columns if x != exog ]
    p = len(selected) + 1
    print(f'Finding the best model for {p - 1} variables')
    print(selected)
    y = data.loc[:, exog]
    best_model = process_subset(y, data, selected)
    best_models.append(best_model)
    for i in reversed(range(2, p)):
        print(f'Finding the best model for {i - 1} variable{"s" if (i - 1) > 1 else ""}')
        model, best_column = backward_remove_variable(data, exog, selected)
        selected.remove(best_column)
        if not best_model or model.rsquared_adj > best_model.rsquared_adj:
            best_model = model
        print(selected)
        best_models.append(model)
    print(f'Fitted {1 + p*(p+1)//2} models')
    return best_model, best_models
best_model, _ = backward_stepwise_selection(df, 'valence')
print('Best overall model:', len(best_model.model.exog_names), best_model.model.exog_names)
Finding the best model for 12 variables
['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'tempo', 'duration_ms', 'time_signature']
Finding the best model for 11 variables
['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'tempo', 'duration_ms']
Finding the best model for 10 variables
['danceability', 'energy', 'key', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'tempo', 'duration_ms']
Finding the best model for 9 variables
['danceability', 'energy', 'key', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'tempo', 'duration_ms']
Finding the best model for 8 variables
['danceability', 'energy', 'key', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'duration_ms']
Finding the best model for 7 variables
['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'duration_ms']
Finding the best model for 6 variables
['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'duration_ms']
Finding the best model for 5 variables
['danceability', 'energy', 'speechiness', 'acousticness', 'duration_ms']
Finding the best model for 4 variables
['danceability', 'energy', 'acousticness', 'duration_ms']
Finding the best model for 3 variables
['danceability', 'energy', 'acousticness']
Finding the best model for 2 variables
['danceability', 'energy']
Finding the best model for 1 variable
['energy']
Fitted 92 models
Best overall model: 12 ['intercept', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'tempo', 'duration_ms']
best_model.summary()
Dep. Variable: | valence | R-squared: | 0.236 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.236 |
Method: | Least Squares | F-statistic: | 389.9 |
Date: | Sun, 09 Jan 2022 | Prob (F-statistic): | 0.00 |
Time: | 17:19:27 | Log-Likelihood: | 3542.1 |
No. Observations: | 13880 | AIC: | -7060. |
Df Residuals: | 13868 | BIC: | -6970. |
Df Model: | 11 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
intercept | -0.2024 | 0.022 | -9.350 | 0.000 | -0.245 | -0.160 |
danceability | 0.3999 | 0.012 | 33.411 | 0.000 | 0.376 | 0.423 |
energy | 0.6026 | 0.015 | 39.456 | 0.000 | 0.573 | 0.633 |
key | 0.0022 | 0.000 | 4.845 | 0.000 | 0.001 | 0.003 |
loudness | -0.0024 | 0.001 | -2.343 | 0.019 | -0.004 | -0.000 |
mode | 0.0054 | 0.003 | 1.633 | 0.103 | -0.001 | 0.012 |
speechiness | -0.1172 | 0.013 | -9.302 | 0.000 | -0.142 | -0.092 |
acousticness | 0.1733 | 0.008 | 22.462 | 0.000 | 0.158 | 0.188 |
instrumentalness | -0.1588 | 0.027 | -5.935 | 0.000 | -0.211 | -0.106 |
liveness | 0.0487 | 0.011 | 4.540 | 0.000 | 0.028 | 0.070 |
tempo | 0.0002 | 5.72e-05 | 3.994 | 0.000 | 0.000 | 0.000 |
duration_ms | -2.897e-07 | 3.27e-08 | -8.862 | 0.000 | -3.54e-07 | -2.26e-07 |
Omnibus: | 140.840 | Durbin-Watson: | 1.834 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 89.787 |
Skew: | 0.014 | Prob(JB): | 3.18e-20 |
Kurtosis: | 2.607 | Cond. No. | 3.63e+06 |
We see that the best statistical model has all the variables except for time_signature. We will use this information when choosing the variables to include in our models in Q2.
Now, let's draw a correlation graph of the independent variables and valence, and check whether some variables seem to have a stronger correlation with valence.
df = df[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'tempo', 'duration_ms', 'time_signature', 'valence']]
corr_matr = df.corr()
plt.figure(figsize=(20,20))
sns.heatmap(corr_matr, annot=True)
plt.show()
The correlation graph seems to give more importance to energy, danceability and loudness. This is logical, as we expect happier songs (songs with above-average valence) to be more energetic and to make people want to dance.
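To read the heatmap numerically, we can also rank the features by the absolute value of their correlation with valence; a one-line companion to the plot:

corr_matr['valence'].drop('valence').abs().sort_values(ascending=False)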
Use Machine Learning techniques to predict valence based on track features:
You will use at least three different methods. For each method you should ensure that you tune your hyperparameters as best as you can.
Once you identify the best method and hyperparameters, explain, to the extent that is possible, which features influence the valence metric.
You will evaluate your predictions on a holdout 20% testing dataset.
import sklearn
import pandas as pd
import numpy as np
np.random.seed(42)
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import optimizers
When I tried to install tensorflow I got an error, as there was no tensorflow version for Python 3.10. To get past this, I created a new virtual environment (assign3_env) with Python 3.8.8, in which I did a fresh installation of all the needed packages as listed in the requirements.txt file. Then, I created a Jupyter kernel for that virtual environment following the instructions on this page.
Having set up the libraries I need, let's now start using some models.
Firstly, I'll build a Minimum Viable Product: a model with no hyperparameter tuning, just to check that everything runs smoothly. I choose the Decision Tree Regressor as my regressor here.
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
X contains the features I'll be using to predict y. I chose all the features except for time_signature, based on the results of Q1.
X = df[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'tempo', 'duration_ms']]
y = df['valence']
As always, it is really important to split our data into training and test sets. In our case, we will hold out 20% of the data as a test set, which we will use to get the final score for our metric.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)
We initialize our model here.
model = DecisionTreeRegressor()
We fit the model to our data.
model.fit(X_train, y_train)
DecisionTreeRegressor()
And here we use the cross-validation score to get a metric of how good our model is. (The default scoring for a regressor is $R^2$, which can be negative when a model does worse than simply predicting the mean; that is what we see below.)
cross_val_score(model, X_test, y_test, cv=10)
array([-0.40634691, -0.36018218, -0.40427474, -0.37538865, -0.53283661, -0.61615335, -0.3642965 , -0.09482089, -0.37924784, -0.26051442])
Since we are interested in the mean absolute error, we take a look at the documentation to see if this metric is implemented for cross_val_score. The available metrics are shown below.
sorted(sklearn.metrics.SCORERS.keys())
['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted', 'max_error', 'mutual_info_score', 'neg_brier_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_gamma_deviance', 'neg_mean_poisson_deviance', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'neg_root_mean_squared_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'rand_score', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'roc_auc_ovo', 'roc_auc_ovo_weighted', 'roc_auc_ovr', 'roc_auc_ovr_weighted', 'top_k_accuracy', 'v_measure_score']
We choose neg_mean_absolute_error, as this metric corresponds to the mean absolute error. (It is negated because scikit-learn scorers follow a higher-is-better convention.)
-np.mean(cross_val_score(model, X_test, y_test, cv=10, scoring='neg_mean_absolute_error'))
0.19187170324390307
from sklearn.metrics import mean_absolute_error
y_pred = model.predict(X_test)
mean_absolute_error(y_test, y_pred)
0.17605965417867436
So, above we built our first model for the regression task at hand. Now, let's build the methods that we are going to actually use. We choose to implement a Random Forest Regressor, an XGBoost Regressor and a deep neural network.
These methods are generally considered to be among the most powerful, so we are going to try them and optimise the hyperparameters of each one.
Let's do a random forest with no optimisation.
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mean_absolute_error(y_test, y_pred)
0.13242974531700288
Now, let's use a simple pipeline with a scaler (not actually needed here, since trees require no scaling, but included as a demonstration) and our regressor.
Then, for our hyperparameter tuning, we use a param_grid dictionary that holds lists of candidate hyperparameter values, so we can find the optimal ones. These hyperparameters belong to our RandomForestRegressor, the rgr step in the pipeline, so we need to pay attention and write them as rgr__name_of_parameter.
We then create the model and with the GridSearchCV we look for the best hyperparameters for this model.
Note: I added a column transformer to the pipeline whose only job is to select the variables I want to use, both for training and for the prediction phase later. That way, I simply have to run the model and the pipeline takes care of all the preprocessing needed.
from sklearn.compose import ColumnTransformer
cols = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'tempo', 'duration_ms']
pipe = Pipeline([
    ("selector", ColumnTransformer([
        ("selector", "passthrough", cols)
    ], remainder="drop")),
    ('scaler', StandardScaler()),
    ('rgr', RandomForestRegressor(random_state=42))
])
param_grid = {
    'rgr__n_estimators': [250],
    'rgr__max_depth': [5, 10, 20]
}
cv = KFold(n_splits=5, shuffle=True, random_state=42)
model_forest = GridSearchCV(pipe, param_grid, n_jobs=2, cv=cv)
model_forest.fit(X_train, y_train)
GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True), estimator=Pipeline(steps=[('selector', ColumnTransformer(transformers=[('selector', 'passthrough', ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'tempo', 'duration_ms'])])), ('scaler', StandardScaler()), ('rgr', RandomForestRegressor(random_state=42))]), n_jobs=2, param_grid={'rgr__max_depth': [5, 10, 20], 'rgr__n_estimators': [250]})
model_forest.best_estimator_
Pipeline(steps=[('selector', ColumnTransformer(transformers=[('selector', 'passthrough', ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'tempo', 'duration_ms'])])), ('scaler', StandardScaler()), ('rgr', RandomForestRegressor(max_depth=20, n_estimators=250, random_state=42))])
y_pred = model_forest.predict(X_test)
mean_absolute_error(y_test, y_pred)
0.13238445079608177
I tried a lot of combinations in the param_grid, with a trial and error approach. Basically, I started with 50, 100, 200 estimators and depths of 5, 10, 20, 30. The best model had 200 estimators and depth 20, so I then tried 200, 250, 300 estimators and depths of 10, 20, 30. I got 250 estimators and depth 20 as best in this iteration, tried different depths for 250 estimators and again got depth 20, so I believe the best estimator for the Random Forest Regressor is the one with hyperparameters close to these values.
Nevertheless, optimising these hyperparameters didn't give us any real improvement over the default RandomForestRegressor.
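Rather than eyeballing these runs, we can also tabulate the grid search results from cv_results_; a small sketch (the column names follow the param_<step>__<name> convention that GridSearchCV uses):

cv_results = pd.DataFrame(model_forest.cv_results_)
cv_results[['param_rgr__n_estimators', 'param_rgr__max_depth',
            'mean_test_score', 'rank_test_score']]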
Let's check the importances of our features based on this model.
rgr = model_forest.best_estimator_.named_steps.rgr
importances = rgr.feature_importances_
std = np.std([tree.feature_importances_ for tree in rgr.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
for f in range(len(df.columns)-2):
    print("%d. feature %d %s (%f)" % (f + 1, indices[f],
                                      df.columns[indices[f]],
                                      importances[indices[f]]))
1. feature 1 energy (0.214897)
2. feature 0 danceability (0.142935)
3. feature 6 acousticness (0.102288)
4. feature 8 liveness (0.091427)
5. feature 5 speechiness (0.090614)
6. feature 10 duration_ms (0.089188)
7. feature 9 tempo (0.089082)
8. feature 3 loudness (0.080569)
9. feature 7 instrumentalness (0.048198)
10. feature 2 key (0.040918)
11. feature 4 mode (0.009884)
plt.figure(figsize=(15, 10))
plt.title("Feature importances")
plt.bar(range(len(df.columns)-2), importances[indices],
        tick_label=[df.columns[x] for x in indices],
        color="r", yerr=std[indices], align="center")
plt.show()
plt.show()
As expected, energy and danceability are still the most important features.
Next we have the xgboost model. Let's take a look at its hyperparameters.
import xgboost as xgb
xgb_reg = xgb.XGBRegressor()
sorted(xgb_reg.get_params().keys())
['base_score', 'booster', 'colsample_bylevel', 'colsample_bynode', 'colsample_bytree', 'enable_categorical', 'gamma', 'gpu_id', 'importance_type', 'interaction_constraints', 'learning_rate', 'max_delta_step', 'max_depth', 'min_child_weight', 'missing', 'monotone_constraints', 'n_estimators', 'n_jobs', 'num_parallel_tree', 'objective', 'predictor', 'random_state', 'reg_alpha', 'reg_lambda', 'scale_pos_weight', 'subsample', 'tree_method', 'validate_parameters', 'verbosity']
xgb_reg.fit(X_train, y_train)
mean_absolute_error(y_test, xgb_reg.predict(X_test))
0.1408632092074274
We choose to search for the optimal model by tuning the number of estimators, the tree method and the depth of the trees.
pipe = Pipeline([
    ("selector", ColumnTransformer([
        ("selector", "passthrough", cols)
    ], remainder="drop")),
    ('rgr', xgb_reg)
])
# note: this pipeline is defined for symmetry with the random forest above, but the
# grid search below is run on the bare xgb_reg, which is why the param_grid keys
# carry no rgr__ prefix
param_grid = {
    'n_estimators': [50, 100],
    'tree_method': ['auto', 'exact', 'approx', 'hist'],
    'max_depth': list(range(5, 11))
}
cv = KFold(n_splits=5, shuffle=True, random_state=42)
model_xgb = GridSearchCV(xgb_reg, param_grid, n_jobs=2, cv=cv)
model_xgb.fit(X_train, y_train)
GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True), estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, enable_categorical=False, gamma=0, gpu_id=-1, importance_type=None, interaction_constraints='', learning_rate=0.300000012, max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=4, num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact', validate_parameters=1, verbosity=None), n_jobs=2, param_grid={'max_depth': [5, 6, 7, 8, 9, 10], 'n_estimators': [50, 100], 'tree_method': ['auto', 'exact', 'approx', 'hist']})
model_xgb.best_params_
{'max_depth': 5, 'n_estimators': 100, 'tree_method': 'approx'}
mean_absolute_error(y_test, model_xgb.predict(X_test))
0.14204629986001882
Again, despite searching for the optimal hyperparameters, we don't get better results than the default model.
fig = plt.figure(figsize=(8, 6))
xgb.plot_importance(model_xgb.best_estimator_, ax = fig.gca())
<AxesSubplot:title={'center':'Feature importance'}, xlabel='F score', ylabel='Features'>
From the xgboost model we see that danceability is the most important feature.
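For the numeric values behind the plot, we can also read feature_importances_ straight off the best estimator; a small sketch that pairs the scores with the column names the model was trained on:

best_xgb = model_xgb.best_estimator_
sorted(zip(X.columns, best_xgb.feature_importances_), key=lambda pair: -pair[1])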
Now, let's build a deep neural network to see if we can get better results than the previous algorithms.
For this neural network I used 5 dense layers (1 input layer, 1 output layer and 3 hidden layers). The first layer has 11 neurons, matching the number of variables I am using.
I used a normalizer to normalize the data (duration_ms takes really high values). I also set an ExponentialDecay schedule for the learning rate, so it is adjusted as training progresses.
To counter overfitting I added some dropout layers, which randomly ignore nodes during training so that the network won't overfit to the training data.
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras import regularizers
BATCH_SIZE = 128
EPOCHS = 100
normalizer = preprocessing.Normalization()
normalizer.adapt(np.array(X_train))
initial_learning_rate = 0.001
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=100000,
    decay_rate=0.96,
    staircase=True)
def build_compile_model():
    model = keras.Sequential([
        normalizer,
        layers.Dense(11, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
        layers.Dropout(0.2),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(1)
    ])
    model.compile(loss='mean_absolute_error',
                  optimizer=optimizers.Adam(learning_rate=lr_schedule))
    return model
model_nn = build_compile_model()
Here we fit the model keeping its history.
history = model_nn.fit(X_train,
                       y_train,
                       batch_size=BATCH_SIZE,
                       epochs=EPOCHS,
                       verbose=1,
                       validation_split=0.2)
Epoch 1/100 - loss: 0.2668 - val_loss: 0.2065
Epoch 2/100 - loss: 0.1953 - val_loss: 0.1894
Epoch 3/100 - loss: 0.1882 - val_loss: 0.1758
Epoch 4/100 - loss: 0.1813 - val_loss: 0.1734
Epoch 5/100 - loss: 0.1783 - val_loss: 0.1708
(per-step progress bars and epochs 6-98 trimmed; both losses keep decreasing slowly, see the plot below)
Epoch 99/100 - loss: 0.1496 - val_loss: 0.1524
Epoch 100/100 - loss: 0.1496 - val_loss: 0.1525
And now we plot the model's loss and validation loss, to confirm that there was indeed little or no overfitting.
def plot_loss(history):
    plt.plot(history.history['loss'], label='loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    plt.xlabel('Epoch')
    plt.ylabel('Error')
    plt.legend()
    plt.grid(True)
plot_loss(history)
model_nn.summary()
Model: "sequential_86" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= normalization_77 (Normaliza (None, 11) 23 tion) dense_295 (Dense) (None, 11) 132 dropout_43 (Dropout) (None, 11) 0 dense_296 (Dense) (None, 256) 3072 dropout_44 (Dropout) (None, 256) 0 dense_297 (Dense) (None, 128) 32896 dropout_45 (Dropout) (None, 128) 0 dense_298 (Dense) (None, 64) 8256 dropout_46 (Dropout) (None, 64) 0 dense_299 (Dense) (None, 1) 65 ================================================================= Total params: 44,444 Trainable params: 44,421 Non-trainable params: 23 _________________________________________________________________
I tried a lot of layer combinations, activation functions, learning rates, etc., but couldn't get a mean absolute error below 0.145 without seriously overfitting. At one point, by adding more layers and neurons and doing nothing to prevent overfitting, I got a mean absolute error lower than 0.09, but the model was obviously overfitting, as the validation error was still at 0.15-0.16.
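A standard tool that could make this tuning less painful, noted here as a possible extension, is early stopping: halt training once the validation loss stops improving and keep the best weights. A minimal sketch (model_es and history_es are our own names):

from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
model_es = build_compile_model()  # a fresh model, so we don't keep training model_nn
history_es = model_es.fit(X_train, y_train,
                          batch_size=BATCH_SIZE, epochs=EPOCHS,
                          validation_split=0.2, verbose=0,
                          callbacks=[early_stop])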
# read the track ids for the final prediction set, one id per line
ids = []
with open('spotify_ids.txt') as f:
    for line in f:
        ids.append(line.strip())  # strip the trailing newline
ids
['7lPN2DXiMsVn7XUKtOW1CS', '5QO79kh1waicV47BqGRL3g', '0VjIjW4GlUZAMYd2vXMi3b', '4MzXwWMhyBbmu6hOcLVD49', '5Kskr9LcNYa0tpt5f0ZEJx', '6tDDoYIxWvMLTdKpjFkc1B', '3VT8hOC5vuDXBsHrR53WFh', '1xK1Gg9SxG8fy2Ya373oqb', '6f3Slt0GbA2bPZlz0aIFXN', '3tjFYV6RSFtuktYl3ZtYcq', '27OeeYzk6klgBh83TSvGMA', '2XIc1pqjXV3Cr2BQUGNBck', '60ynsPSSKe6O3sfwRnIBRf', '1M4OcYkxAtu3ErzSgDEfoi', '3YJJjQPAbDT7mGpX3WtQ9A', '5nujrmhLynf4yMoMtj8AQF', '1t9WgS8FN0534tLBRwbaxO', '7vrJn5hDSXRmdXoR30KgF1', '4saklk6nie3yiGePpBwUoc', '3FAJ6O0NOHQV8Mc5Ri6ENp', '0D75ciM842cdUMKSMfAR9y', '35mvY5S1H3J2QZyna3TFe0', '6Im9k8u9iIzKMrmV7BWtlF', '5YYW3yRktprLRr47WK219Y', '7hxHWCCAIIxFLCzvDgnQHX', '3VvA1wSxukMLsvXoXtlwWx', '1diS6nkxMQc3wwC4G1j0bh', '6ft4hAq6yde8jPZY2i5zLr', '7qEHsqek33rTcFNT9PFqLf', '31qCy5ZaophVA81wtlwLc4', '3iw6V4LH7yPj1ESORX9RIN', '45bE4HXI0AwGZXfZtMp8JR', '02MWAaffLxlfxAUY7c5dvx', '1tkg4EHVoqnhR6iFEXb60y', '5uEYRdEIh9Bo4fpjDd4Na9', '6UelLqGlWMcVH1E5c4H7lY', '1J14CdDAvBTE1AJYUOwl6C', '6cx06DFPPHchuUAcTxznu9', '5YaskwnGDZFDRipaqzbwQx', '54bFM56PmE4YLRnqpW6Tha', '2vBET2pmrQqafaS6zIaYta', '7ytR5pFWmSjzHJIeQkgog4', '2Oycxb8QbPkpHTo8ZrmG0B', '1yoMvmasuxZfqHEipJhRbp', '1rgnBhdG2JDFTbYkYRZAku', '2QjOHCTQ1Jl3zawyYOpxh6', '4Oun2ylbjFKMPTiaSbbCih', ...]
features = {}
start = 0
num_tracks = 100
while start < len(ids):
    print(f'getting from {start} to {start+num_tracks}')
    tracks_batch = ids[start:start+num_tracks]
    features_batch = sp.audio_features(tracks_batch)
    features.update({ track_id : track_features
                      for track_id, track_features in zip(tracks_batch, features_batch) })
    start += num_tracks
print(len(features))
df_test = pd.DataFrame.from_dict(features, orient='index')
df_test
getting from 0 to 100
getting from 100 to 200
getting from 200 to 300
getting from 300 to 400
getting from 400 to 500
getting from 500 to 600
getting from 600 to 700
getting from 700 to 800
getting from 800 to 900
getting from 900 to 1000
getting from 1000 to 1100
getting from 1100 to 1200
1162
danceability | energy | key | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | type | id | uri | track_href | analysis_url | duration_ms | time_signature | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
7lPN2DXiMsVn7XUKtOW1CS | 0.585 | 0.436 | 10 | -8.761 | 1 | 0.0601 | 0.72100 | 0.000013 | 0.1050 | 0.132 | 143.874 | audio_features | 7lPN2DXiMsVn7XUKtOW1CS | spotify:track:7lPN2DXiMsVn7XUKtOW1CS | https://api.spotify.com/v1/tracks/7lPN2DXiMsVn... | https://api.spotify.com/v1/audio-analysis/7lPN... | 242014 | 4 |
5QO79kh1waicV47BqGRL3g | 0.680 | 0.826 | 0 | -5.487 | 1 | 0.0309 | 0.02120 | 0.000012 | 0.5430 | 0.644 | 118.051 | audio_features | 5QO79kh1waicV47BqGRL3g | spotify:track:5QO79kh1waicV47BqGRL3g | https://api.spotify.com/v1/tracks/5QO79kh1waic... | https://api.spotify.com/v1/audio-analysis/5QO7... | 215627 | 4 |
0VjIjW4GlUZAMYd2vXMi3b | 0.514 | 0.730 | 1 | -5.934 | 1 | 0.0598 | 0.00146 | 0.000095 | 0.0897 | 0.334 | 171.005 | audio_features | 0VjIjW4GlUZAMYd2vXMi3b | spotify:track:0VjIjW4GlUZAMYd2vXMi3b | https://api.spotify.com/v1/tracks/0VjIjW4GlUZA... | https://api.spotify.com/v1/audio-analysis/0VjI... | 200040 | 4 |
4MzXwWMhyBbmu6hOcLVD49 | 0.731 | 0.573 | 4 | -10.059 | 0 | 0.0544 | 0.40100 | 0.000052 | 0.1130 | 0.145 | 109.928 | audio_features | 4MzXwWMhyBbmu6hOcLVD49 | spotify:track:4MzXwWMhyBbmu6hOcLVD49 | https://api.spotify.com/v1/tracks/4MzXwWMhyBbm... | https://api.spotify.com/v1/audio-analysis/4MzX... | 205090 | 4 |
5Kskr9LcNYa0tpt5f0ZEJx | 0.907 | 0.393 | 4 | -7.636 | 0 | 0.0539 | 0.45100 | 0.000001 | 0.1350 | 0.202 | 104.949 | audio_features | 5Kskr9LcNYa0tpt5f0ZEJx | spotify:track:5Kskr9LcNYa0tpt5f0ZEJx | https://api.spotify.com/v1/tracks/5Kskr9LcNYa0... | https://api.spotify.com/v1/audio-analysis/5Ksk... | 205458 | 4 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4lUmnwRybYH7mMzf16xB0y | 0.596 | 0.650 | 9 | -5.167 | 1 | 0.3370 | 0.13800 | 0.000000 | 0.1400 | 0.188 | 133.997 | audio_features | 4lUmnwRybYH7mMzf16xB0y | spotify:track:4lUmnwRybYH7mMzf16xB0y | https://api.spotify.com/v1/tracks/4lUmnwRybYH7... | https://api.spotify.com/v1/audio-analysis/4lUm... | 257428 | 4 |
1fzf9Aad4y1RWrmwosAK5y | 0.588 | 0.850 | 4 | -6.431 | 1 | 0.0318 | 0.16800 | 0.002020 | 0.0465 | 0.768 | 93.003 | audio_features | 1fzf9Aad4y1RWrmwosAK5y | spotify:track:1fzf9Aad4y1RWrmwosAK5y | https://api.spotify.com/v1/tracks/1fzf9Aad4y1R... | https://api.spotify.com/v1/audio-analysis/1fzf... | 187310 | 4 |
3E3pb3qH11iny6TFDJvsg5 | 0.754 | 0.660 | 0 | -6.811 | 1 | 0.2670 | 0.17900 | 0.000000 | 0.1940 | 0.316 | 83.000 | audio_features | 3E3pb3qH11iny6TFDJvsg5 | spotify:track:3E3pb3qH11iny6TFDJvsg5 | https://api.spotify.com/v1/tracks/3E3pb3qH11in... | https://api.spotify.com/v1/audio-analysis/3E3p... | 209299 | 4 |
3yTkoTuiKRGL2VAlQd7xsC | 0.584 | 0.836 | 0 | -4.925 | 1 | 0.0790 | 0.05580 | 0.000000 | 0.0663 | 0.484 | 104.973 | audio_features | 3yTkoTuiKRGL2VAlQd7xsC | spotify:track:3yTkoTuiKRGL2VAlQd7xsC | https://api.spotify.com/v1/tracks/3yTkoTuiKRGL... | https://api.spotify.com/v1/audio-analysis/3yTk... | 202204 | 4 |
4JE6agBLHGA5TaF6FlqfBD | 0.331 | 0.450 | 10 | -5.362 | 1 | 0.0340 | 0.36500 | 0.004640 | 0.2200 | 0.180 | 123.829 | audio_features | 4JE6agBLHGA5TaF6FlqfBD | spotify:track:4JE6agBLHGA5TaF6FlqfBD | https://api.spotify.com/v1/tracks/4JE6agBLHGA5... | https://api.spotify.com/v1/audio-analysis/4JE6... | 218755 | 4 |
1162 rows × 18 columns
Since the random forest was our best model, we will use it to predict valence for the above 1162 songs.
y_pred = model_forest.predict(df_test)
mean_absolute_error(df_test['valence'], y_pred)
0.14071737286609584