Popular music has evolved through many forms over the last several decades: from the Beatles, to Led Zeppelin, to Madonna, to Nirvana, to Kanye West, to Rihanna. There is no single sound or approach to popular music. With such variation in instruments, lyrical content, and genre, it's remarkable that such different songs can all break into the mainstream. But do these songs have something in common? Do Cardi B's songs and Harry Styles' songs share anything that might explain or predict their popularity? That is what I will be investigating in this tutorial: are there any characteristics that can predict a song's popularity?
Why ask this question? The answer would be valuable to music streaming services and radio stations, which would gain better insight into which songs are likely to be hits. It would also benefit artists, who would gain powerful insight into which aspects of a song are most predictive of its success. For musicians looking to break into radio play or popular Spotify playlists, this would be a crucial insight to acquire.
First, I need to import the appropriate packages that will be used in this project.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import seaborn as sb
Now, I will import my data. It comes from https://www.kaggle.com/yamaerenay/spotify-dataset-19212020-160k-tracks?select=tracks.csv: a Kaggle dataset of roughly 600 thousand songs on Spotify, with each song's popularity and a variety of other characteristics. The dataset can be downloaded from Kaggle after making a free account.
songs = pd.read_csv('archive/tracks.csv')
songs.head()
id | name | popularity | duration_ms | explicit | artists | id_artists | release_date | danceability | energy | key | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | time_signature | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 35iwgR4jXetI318WEWsa1Q | Carve | 6 | 126903 | 0 | ['Uli'] | ['45tIt06XoI0Iio4LBEVpls'] | 1922-02-22 | 0.645 | 0.4450 | 0 | -13.338 | 1 | 0.4510 | 0.674 | 0.7440 | 0.151 | 0.127 | 104.851 | 3 |
1 | 021ht4sdgPcrDgSk7JTbKY | Capítulo 2.16 - Banquero Anarquista | 0 | 98200 | 0 | ['Fernando Pessoa'] | ['14jtPCOoNZwquk5wd9DxrY'] | 1922-06-01 | 0.695 | 0.2630 | 0 | -22.136 | 1 | 0.9570 | 0.797 | 0.0000 | 0.148 | 0.655 | 102.009 | 1 |
2 | 07A5yehtSnoedViJAZkNnc | Vivo para Quererte - Remasterizado | 0 | 181640 | 0 | ['Ignacio Corsini'] | ['5LiOoJbxVSAMkBS2fUm3X2'] | 1922-03-21 | 0.434 | 0.1770 | 1 | -21.180 | 1 | 0.0512 | 0.994 | 0.0218 | 0.212 | 0.457 | 130.418 | 5 |
3 | 08FmqUhxtyLTn6pAh6bk45 | El Prisionero - Remasterizado | 0 | 176907 | 0 | ['Ignacio Corsini'] | ['5LiOoJbxVSAMkBS2fUm3X2'] | 1922-03-21 | 0.321 | 0.0946 | 7 | -27.961 | 1 | 0.0504 | 0.995 | 0.9180 | 0.104 | 0.397 | 169.980 | 3 |
4 | 08y9GfoqCWfOGsKdwojr5e | Lady of the Evening | 0 | 163080 | 0 | ['Dick Haymes'] | ['3BiJGZsyX9sJchTqcSA7Su'] | 1922 | 0.402 | 0.1580 | 3 | -16.900 | 0 | 0.0390 | 0.989 | 0.1300 | 0.311 | 0.196 | 103.220 | 4 |
Now that we have the data imported, it's time to clean it up. First, I will remove the columns that will not be considered in the analysis. Since we are looking only at the characteristics of the song, identifying aspects such as name and artist can be dropped. I will also rename the remaining columns for appearance.
drop_columns = ['name', 'artists', 'id', 'explicit', 'id_artists', 'key', 'loudness', 'mode', 'instrumentalness', 'liveness', 'time_signature']
songs = songs.drop(drop_columns, axis=1)
songs = songs.rename(columns ={'popularity': 'Popularity', 'duration_ms':'Duration', 'release_date': 'Year', 'danceability': 'Danceability', 'energy': 'Energy', 'speechiness': 'Speechiness', 'acousticness': 'Acousticness', 'valence': 'Valence', 'tempo': 'Tempo'})
songs.head()
Popularity | Duration | Year | Danceability | Energy | Speechiness | Acousticness | Valence | Tempo | |
---|---|---|---|---|---|---|---|---|---|
0 | 6 | 126903 | 1922-02-22 | 0.645 | 0.4450 | 0.4510 | 0.674 | 0.127 | 104.851 |
1 | 0 | 98200 | 1922-06-01 | 0.695 | 0.2630 | 0.9570 | 0.797 | 0.655 | 102.009 |
2 | 0 | 181640 | 1922-03-21 | 0.434 | 0.1770 | 0.0512 | 0.994 | 0.457 | 130.418 |
3 | 0 | 176907 | 1922-03-21 | 0.321 | 0.0946 | 0.0504 | 0.995 | 0.397 | 169.980 |
4 | 0 | 163080 | 1922 | 0.402 | 0.1580 | 0.0390 | 0.989 | 0.196 | 103.220 |
One final tidying touch is to convert the original release dates to integer years, which will be used in later analysis.
for index, rows in songs.iterrows():
    s = songs.at[index, 'Year']
    songs.at[index, 'Year'] = int(s[0:4])
songs.head()
Popularity | Duration | Year | Danceability | Energy | Speechiness | Acousticness | Valence | Tempo | |
---|---|---|---|---|---|---|---|---|---|
0 | 6 | 126903 | 1922 | 0.645 | 0.4450 | 0.4510 | 0.674 | 0.127 | 104.851 |
1 | 0 | 98200 | 1922 | 0.695 | 0.2630 | 0.9570 | 0.797 | 0.655 | 102.009 |
2 | 0 | 181640 | 1922 | 0.434 | 0.1770 | 0.0512 | 0.994 | 0.457 | 130.418 |
3 | 0 | 176907 | 1922 | 0.321 | 0.0946 | 0.0504 | 0.995 | 0.397 | 169.980 |
4 | 0 | 163080 | 1922 | 0.402 | 0.1580 | 0.0390 | 0.989 | 0.196 | 103.220 |
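The row-by-row loop works, but on a table of this size a vectorized version is much faster. A sketch of an equivalent one-liner, with a hypothetical helper name, assuming every `release_date` string begins with a four-digit year (which holds for this dataset):

```python
import pandas as pd

def year_from_release(dates):
    """Slice the leading four characters of each release-date string and
    cast to int; handles both 'YYYY-MM-DD' and bare 'YYYY' entries."""
    return dates.astype(str).str[:4].astype(int)

# e.g. songs['Year'] = year_from_release(songs['Year'])
years = year_from_release(pd.Series(['1922-02-22', '1922', '2014-06-01']))
```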
Now, we have our tidied data. Here is a list of the columns and their meanings:
1) Popularity: a measure from 0-100 of how popular the song is
2) Duration: how long the song is in milliseconds
3) Year: the year the song was released
4) Danceability: a continuous scale from 0-1 of how conducive the song is to dancing
5) Energy: a continuous scale from 0-1 of how energetic the song is
6) Speechiness: a continuous scale from 0-1 of how much spoken word the song contains
7) Acousticness: a continuous scale from 0-1 of how acoustic the song is
8) Valence: a continuous scale from 0-1 of how positive and upbeat the song is
9) Tempo: a measure of the BPM of the song
More info can be found at https://www.kaggle.com/yamaerenay/spotify-dataset-19212020-160k-tracks.
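Before plotting, it's worth checking that the columns actually stay inside their documented ranges. A small sanity-check sketch (`out_of_range` is a hypothetical helper, illustrated here on a toy frame; on the real data you would pass `songs`):

```python
import pandas as pd

# Documented bounds: popularity is 0-100, the perceptual scales are 0-1.
BOUNDS = {
    'Popularity': (0, 100),
    'Danceability': (0, 1),
    'Energy': (0, 1),
    'Speechiness': (0, 1),
    'Acousticness': (0, 1),
    'Valence': (0, 1),
}

def out_of_range(df):
    """Count, for each documented column present in df, the rows outside bounds."""
    return {col: int(((df[col] < lo) | (df[col] > hi)).sum())
            for col, (lo, hi) in BOUNDS.items() if col in df.columns}

# Toy check; on the real data every count in out_of_range(songs) should be 0.
counts = out_of_range(pd.DataFrame({'Popularity': [6, 101], 'Energy': [0.4, 0.2]}))
```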
Now let's take a look at the actual distribution of popularity by plotting a histogram.
ax = songs['Popularity'].plot.hist(title='Distribution of Popularity', bins=20)
ax.set(xlabel='Popularity', ylabel='Frequency')
There are a very large number of entries with 0 popularity, which greatly skews the distribution. To remedy this, I am going to remove the 0-popularity entries.
songs = songs[songs['Popularity'] > 0]
ax = songs['Popularity'].plot.hist(title='Distribution of Popularity', bins=20)
ax.set(xlabel='Popularity', ylabel='Frequency')
The data is still very skewed to the right, but this is certainly an improvement.
Now, I will plot popularity against the remaining characteristics. This is a preliminary step to get a sense of any visible relationships between the characteristics and popularity. From here on out, I will also be working with a subset of the data, since the full dataset is too large to work with comfortably. I define a new dataframe rand that is a random sample of 2,500 entries from the original table.
no_plot_array = ['Popularity', 'Year']
rand = songs.sample(n = 2500)
for column, items in rand.items():
    if column not in no_plot_array:
        plt.scatter(x = column, y = 'Popularity', data = rand)
        plt.title("Popularity vs. {}".format(column))
        plt.xlabel('{}'.format(column))
        plt.ylabel('Popularity')
        plt.show()
Not much luck here. Other than possibly energy and danceability, no visible correlations exist in the data. As a last-ditch effort to find correlations, I am going to compute the Spearman coefficients between all of the characteristics. I use the Spearman coefficient because popularity is not continuous but ordinal, so the Pearson coefficient is not appropriate.
subset = songs[['Popularity', 'Energy', 'Danceability', 'Valence', 'Tempo', 'Acousticness']]
subset.corr(method='spearman')
Popularity | Energy | Danceability | Valence | Tempo | Acousticness | |
---|---|---|---|---|---|---|
Popularity | 1.000000 | 0.253049 | 0.180074 | -0.013187 | 0.050153 | -0.297017 |
Energy | 0.253049 | 1.000000 | 0.202052 | 0.367993 | 0.223387 | -0.699404 |
Danceability | 0.180074 | 0.202052 | 1.000000 | 0.502802 | -0.050562 | -0.187001 |
Valence | -0.013187 | 0.367993 | 0.502802 | 1.000000 | 0.121502 | -0.167450 |
Tempo | 0.050153 | 0.223387 | -0.050562 | 0.121502 | 1.000000 | -0.204376 |
Acousticness | -0.297017 | -0.699404 | -0.187001 | -0.167450 | -0.204376 | 1.000000 |
None of the correlations here is overly strong, but it is important to keep in mind that song popularity ultimately reflects human choice and opinion, which is extremely variable. With this in mind, I will continue to consider any characteristic whose absolute correlation with popularity is greater than 0.1. That leaves just three characteristics: energy, danceability, and acousticness.
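The |ρ| > 0.1 cutoff can also be applied programmatically rather than by eye. A sketch with a hypothetical `select_features` helper, illustrated on a toy frame with known rank correlations (on the real data you would pass the `subset` frame and `'Popularity'`):

```python
import pandas as pd

def select_features(df, target, threshold=0.1):
    """Columns whose absolute Spearman correlation with `target` exceeds
    `threshold` (the target column itself is excluded)."""
    rho = df.corr(method='spearman')[target].drop(target)
    return rho[rho.abs() > threshold].index.tolist()

# Toy frame: 'a' is perfectly monotone with 'y', 'b' perfectly
# anti-monotone, 'c' uncorrelated in rank.
toy = pd.DataFrame({'y': [1, 2, 3, 4, 5],
                    'a': [1, 2, 3, 4, 5],
                    'b': [5, 4, 3, 2, 1],
                    'c': [4, 3, 2, 1, 5]})
picked = select_features(toy, 'y')
```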
With this in mind, I am going to make some preliminary regression models of these three characteristics against popularity. To do this, I use the sklearn regression functions with separate training and testing sets. It took some trial and error to determine the split sizes, but I settled on a 70/30 train/test split because it yielded the best results. For each model, I print some defining features such as the slope and intercept, along with the coefficient of determination and mean squared error, to get an idea of the model's predictive power.
new_subset = rand[['Energy', 'Danceability', 'Acousticness']]
fig, ax = plt.subplots(1, 3, sharey=True, tight_layout=True)
fig.suptitle("Deviation From Linear Regression Model")
# enumerate gives the subplot index directly (Energy=0, Danceability=1, Acousticness=2)
for i, s in enumerate(new_subset):
    y = rand['Popularity'].to_numpy().reshape((-1, 1))
    X = rand[s].to_numpy().reshape((-1, 1))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    reg = LinearRegression()
    reg.fit(X_train, y_train)
    intercept = reg.intercept_
    coefficient = reg.coef_
    r_squared = reg.score(X_test, y_test)
    pred = reg.predict(X_test)
    mse = mean_squared_error(y_test, pred)
    print("Regression analysis for Popularity vs. {}".format(s))
    print("Intercept: \n", intercept)
    print("Slope: \n", coefficient)
    print("Coefficient of Determination: \n", r_squared)
    print("Mean Squared Error: \n", mse)
    print("\n\n")
    rand['{} Residuals'.format(s)] = rand['Popularity'] - (intercept[0] + coefficient[0, 0] * rand[s])
    ax[i].hist(rand['{} Residuals'.format(s)].to_numpy(), bins=10)
    ax[i].set_xlabel("Residuals for {}".format(s))
    ax[i].set_ylabel("Frequency")
plt.show()
Regression analysis for Popularity vs. Energy
Intercept: [20.35687817]
Slope: [[16.88840861]]
Coefficient of Determination: 0.09350707221084065
Mean Squared Error: 292.054481299843

Regression analysis for Popularity vs. Danceability
Intercept: [18.02216442]
Slope: [[20.79434411]]
Coefficient of Determination: 0.04694697775983081
Mean Squared Error: 307.88966827878494

Regression analysis for Popularity vs. Acousticness
Intercept: [37.12370761]
Slope: [[-17.79805842]]
Coefficient of Determination: 0.08850936708549184
Mean Squared Error: 294.5388335741142
So as you can see, the coefficients of determination are rather low. This could imply that despite any possible correlation, these variables have very little predictive power when it comes to predicting a song's popularity. However, as I noted before, popularity is inherently reflective of human behavior and opinions. So, while these coefficients of determination are certainly low, that may not necessarily rule out any predictive power just yet.
The residuals also appear to be approximately normally distributed, which is in keeping with the assumptions of linear regression.
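Eyeballing histograms can be backed up with a formal check; a sketch using SciPy's D'Agostino-Pearson test (`scipy.stats.normaltest`), illustrated here on synthetic residuals (on the real data you would pass `rand['Energy Residuals'].to_numpy()` instead):

```python
import numpy as np
from scipy import stats

def residuals_normality_pvalue(residuals):
    """p-value of the D'Agostino-Pearson normality test; small values
    (e.g. < 0.05) suggest the residuals are not normally distributed."""
    statistic, pvalue = stats.normaltest(residuals)
    return float(pvalue)

# Synthetic illustration: normal draws should pass, uniform draws should fail.
rng = np.random.default_rng(0)
p_normal = residuals_normality_pvalue(rng.normal(size=2000))
p_uniform = residuals_normality_pvalue(rng.uniform(size=2000))
```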
Where to go now? As a next step, I am going to see if the residuals differ by year. I will divide the years into 5 distinct periods using the pandas cut function and make violin plots of the residuals, to see whether separate models for the different time periods are worth creating.
rand['Time Period'] = pd.cut(x=rand['Year'], bins=5, labels = ['1922-1942', '1943-1962', '1963-1981', '1982-2001', '2002-2021'])
fig, ax = plt.subplots()
sb.violinplot(x='Time Period', y='Energy Residuals', data=rand)
ax.set_title("Energy Residuals vs. Time")
ax.set_xlabel('Time Period')
ax.set_ylabel('Residuals')
plt.show()
fig, ax = plt.subplots()
sb.violinplot(x='Time Period', y='Danceability Residuals', data=rand)
ax.set_title("Danceability Residuals vs. Time")
ax.set_xlabel('Time Period')
ax.set_ylabel('Residuals')
plt.show()
fig, ax = plt.subplots()
sb.violinplot(x='Time Period', y='Acousticness Residuals', data=rand)
ax.set_title("Acousticness Residuals vs. Time")
ax.set_xlabel('Time Period')
ax.set_ylabel('Residuals')
plt.show()
The distribution of residuals changes drastically over time, with the model consistently underpredicting the popularity of older songs and the spread evening out as time goes on. This warrants a deeper investigation into whether these three song characteristics have varying predictive power over time. To accomplish this, I will start by making separate regression analyses for the different time periods.
First, I am going to check whether the characteristic values differ significantly over time. If so, it may be necessary to standardize energy, danceability, and acousticness.
plot = ['Energy', 'Danceability', 'Acousticness']
for p in plot:
    fig, ax = plt.subplots()
    sb.violinplot(x='Time Period', y=p, data=rand)
    ax.set_title("{} vs. Time".format(p))
    ax.set_xlabel('Time Period')
    ax.set_ylabel('{}'.format(p))
    plt.show()
Energy and acousticness vary noticeably over the different time periods, so I will standardize them so they can be compared across periods. Danceability remains fairly consistent, but for consistency I will standardize it as well. To standardize, I compute the mean and standard deviation of each characteristic within each individual year, subtract that year's mean from each value, and divide the result by that year's standard deviation. I use dictionaries to map each year to its means and standard deviations.
e_average = {}
e_std = {}
d_average = {}
d_std = {}
a_average = {}
a_std = {}
for y in rand.Year.unique():
    # comp holds only the songs from year y; note the per-year statistics
    # come from comp, not from the whole sample
    comp = rand.loc[rand['Year'] == y]
    e_average[y] = comp['Energy'].mean()
    e_std[y] = comp['Energy'].std()
    d_average[y] = comp['Danceability'].mean()
    d_std[y] = comp['Danceability'].std()
    a_average[y] = comp['Acousticness'].mean()
    a_std[y] = comp['Acousticness'].std()
Next, I create a new dataframe, drop all of the columns that are no longer needed in the analysis, and standardize energy, danceability, and acousticness. I also drop the unstandardized energy, danceability, and acousticness columns since they will no longer be necessary.
stdzd_rand = rand.copy(deep=True)
stdzd_rand = stdzd_rand.reset_index()
stdzd_rand = stdzd_rand.drop(['Energy Residuals', 'Danceability Residuals', 'Acousticness Residuals', 'Valence', 'Tempo', 'Speechiness', 'Duration'], axis=1)
stdzd_rand['Standard Energy'] = float(0)
stdzd_rand['Standard Danceability'] = float(0)
stdzd_rand['Standard Acousticness'] = float(0)
for index, rows in stdzd_rand.iterrows():
    stdzd_rand.at[index, 'Standard Energy'] = (stdzd_rand.at[index, 'Energy'] - e_average[stdzd_rand.at[index, 'Year']])/e_std[stdzd_rand.at[index, 'Year']]
    stdzd_rand.at[index, 'Standard Danceability'] = (stdzd_rand.at[index, 'Danceability'] - d_average[stdzd_rand.at[index, 'Year']])/d_std[stdzd_rand.at[index, 'Year']]
    stdzd_rand.at[index, 'Standard Acousticness'] = (stdzd_rand.at[index, 'Acousticness'] - a_average[stdzd_rand.at[index, 'Year']])/a_std[stdzd_rand.at[index, 'Year']]
stdzd_rand = stdzd_rand.drop(['Energy', 'Danceability', 'Acousticness', 'index'], axis=1)
stdzd_rand.head()
stdzd_rand.head()
Popularity | Year | Time Period | Standard Energy | Standard Danceability | Standard Acousticness | |
---|---|---|---|---|---|---|
0 | 6 | 1945 | 1943-1962 | -2.058710 | -0.456082 | 1.719453 |
1 | 13 | 1983 | 1982-2001 | -0.865288 | 0.053157 | 0.035528 |
2 | 27 | 1998 | 1982-2001 | -1.504548 | -1.370257 | 1.260471 |
3 | 39 | 2014 | 2002-2021 | 0.938486 | -0.701498 | -0.947110 |
4 | 8 | 1957 | 1943-1962 | -0.136449 | 0.010209 | 1.025020 |
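The per-year dictionaries plus iterrows loop can be collapsed into a single groupby-transform call, which is both shorter and far faster. A sketch of the equivalent z-scoring with a hypothetical helper, illustrated on a toy frame (on the real data: `standardize_within_year(rand, ['Energy', 'Danceability', 'Acousticness'])`):

```python
import pandas as pd

def standardize_within_year(df, cols):
    """Z-score each column in cols within its release year: subtract the
    per-year mean and divide by the per-year (sample) std.  Years with a
    single row yield NaN, since the sample std is undefined there."""
    out = df.copy()
    grouped = out.groupby('Year')
    for c in cols:
        out['Standard ' + c] = grouped[c].transform(lambda s: (s - s.mean()) / s.std())
    return out

# Toy illustration: two years, three songs each.
toy = pd.DataFrame({'Year': [1990, 1990, 1990, 2000, 2000, 2000],
                    'Energy': [0.2, 0.4, 0.6, 0.1, 0.5, 0.9]})
zscored = standardize_within_year(toy, ['Energy'])
```

Within each year the standardized column then has mean 0 and unit sample variance, which is exactly what the dictionary-based loop computes.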
Now, in order to complete this regression analysis, I will need to split the data into 5 separate tables for the 5 different time periods.
oldest = stdzd_rand[stdzd_rand['Time Period'] == '1922-1942']
oldest = oldest.reset_index()
older = stdzd_rand[stdzd_rand['Time Period'] == '1943-1962']
older = older.reset_index()
mid = stdzd_rand[stdzd_rand['Time Period'] == '1963-1981']
mid = mid.reset_index()
newer = stdzd_rand[stdzd_rand['Time Period'] == '1982-2001']
newer = newer.reset_index()
newest = stdzd_rand[stdzd_rand['Time Period'] == '2002-2021']
newest = newest.reset_index()
And finally, I will create the regression models for each time period following the same procedure as before with sklearn.
df_collection = [oldest, older, mid, newer, newest]
characteristics = ['Standard Energy', 'Standard Danceability', 'Standard Acousticness']
for table in df_collection:
    for c in characteristics:
        X = table['{}'.format(c)].to_numpy().reshape((-1, 1))
        y = table['Popularity'].to_numpy().reshape((-1, 1))
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        reg = LinearRegression()
        reg.fit(X_train, y_train)
        intercept = reg.intercept_
        coefficient = reg.coef_
        r_squared = reg.score(X_test, y_test)
        pred = reg.predict(X_test)
        mse = mean_squared_error(y_test, pred)
        print("Regression analysis for Popularity vs. {}".format(c))
        print("Time Period: {}".format(table.at[1, 'Time Period']))
        print("Intercept: \n", intercept)
        print("Slope: \n", coefficient)
        print("Coefficient of Determination: \n", r_squared)
        print("Mean Squared Error: \n", mse)
        print("\n\n")
Regression analysis for Popularity vs. Standard Energy
Time Period: 1922-1942
Intercept: [12.49431334]
Slope: [[4.54656561]]
Coefficient of Determination: 0.18155939408165145
Mean Squared Error: 9.219983968712825

Regression analysis for Popularity vs. Standard Danceability
Time Period: 1922-1942
Intercept: [7.08484265]
Slope: [[2.08344521]]
Coefficient of Determination: -0.38946676686365245
Mean Squared Error: 11.909715144545592

Regression analysis for Popularity vs. Standard Acousticness
Time Period: 1922-1942
Intercept: [9.58255028]
Slope: [[-2.67498598]]
Coefficient of Determination: -0.1274914616698104
Mean Squared Error: 63.36962215180933

Regression analysis for Popularity vs. Standard Energy
Time Period: 1943-1962
Intercept: [10.28241854]
Slope: [[0.17517256]]
Coefficient of Determination: -0.009481035987703335
Mean Squared Error: 125.71763330117209

Regression analysis for Popularity vs. Standard Danceability
Time Period: 1943-1962
Intercept: [10.16056537]
Slope: [[-0.19262239]]
Coefficient of Determination: -0.0019238394954044225
Mean Squared Error: 71.9625334424359

Regression analysis for Popularity vs. Standard Acousticness
Time Period: 1943-1962
Intercept: [10.08572289]
Slope: [[0.09417206]]
Coefficient of Determination: -0.005195675083753404
Mean Squared Error: 104.20725684784058

Regression analysis for Popularity vs. Standard Energy
Time Period: 1963-1981
Intercept: [23.34635326]
Slope: [[1.14051521]]
Coefficient of Determination: -0.006060557028874758
Mean Squared Error: 192.68521990466982

Regression analysis for Popularity vs. Standard Danceability
Time Period: 1963-1981
Intercept: [23.0745588]
Slope: [[-0.54227011]]
Coefficient of Determination: -0.031246719287566727
Mean Squared Error: 193.5006526599412

Regression analysis for Popularity vs. Standard Acousticness
Time Period: 1963-1981
Intercept: [24.48165504]
Slope: [[-3.49816119]]
Coefficient of Determination: 0.003350552357349823
Mean Squared Error: 156.49143746476483

Regression analysis for Popularity vs. Standard Energy
Time Period: 1982-2001
Intercept: [30.14607761]
Slope: [[1.25014216]]
Coefficient of Determination: -0.02093746566480359
Mean Squared Error: 199.37460315641493

Regression analysis for Popularity vs. Standard Danceability
Time Period: 1982-2001
Intercept: [29.99454088]
Slope: [[0.59770366]]
Coefficient of Determination: 0.0006718394974339903
Mean Squared Error: 212.75094865473588

Regression analysis for Popularity vs. Standard Acousticness
Time Period: 1982-2001
Intercept: [29.40304929]
Slope: [[-2.00655711]]
Coefficient of Determination: 0.04194263786777608
Mean Squared Error: 185.7982853401452

Regression analysis for Popularity vs. Standard Energy
Time Period: 2002-2021
Intercept: [39.55543553]
Slope: [[-0.53096196]]
Coefficient of Determination: -0.0259333793409251
Mean Squared Error: 315.6165145156702

Regression analysis for Popularity vs. Standard Danceability
Time Period: 2002-2021
Intercept: [39.17027201]
Slope: [[3.26479831]]
Coefficient of Determination: 0.03735380573057523
Mean Squared Error: 245.24007643203137

Regression analysis for Popularity vs. Standard Acousticness
Time Period: 2002-2021
Intercept: [39.54968209]
Slope: [[-0.34562031]]
Coefficient of Determination: -0.003743746785304536
Mean Squared Error: 308.6305598070967
As you can see, the coefficients of determination are actually much lower than they were without the time-period split. This indicates that these characteristics do not wield predictive power within the separated time periods either.
So what is our final answer to the original question: are there certain characteristics that can predict whether or not a song will be popular? Well, there might be, but we have not found them here. Based on the dataset I used and my analysis of the different characteristics, there is little to no evidence of significant (or any) predictive power for any of the characteristics tested. There was very little predictive power, or even correlation, between the song characteristics in the dataset and a song's popularity. Even after testing whether these characteristics were more meaningful in determining popularity within different time periods, there was actually less predictive power in each individual time period than in the aggregate dataset.
So what does this actually mean? It could mean a number of things. There could be issues with the dataset. I did not personally collect this data, so how they actually quantified qualitative characteristics such as energy, danceability, or even popularity is not immediately obvious to me and there could certainly be biases or even flaws in this methodology. It could also possibly be some combination of the characteristics that can predict a song's popularity. It could also be some quality (or combination of qualities) that were not present in this dataset. Aspects of a song that would not be able to be quantified such as lyrical content, melody, harmony, cadence, etc. can also absolutely play a crucial role in predicting a song's popularity. With more time and more people, all of these possibilities could be explored in greater detail to see if there is a more satisfying answer to my original question. But for now, I can safely say that based on this particular dataset and methodology, these tested characteristics do not explain a song's popularity.
Thank you so much for reading through this tutorial! I hope you enjoyed it. If you are interested in the topic of predicting song popularity, I have linked a paper from Stanford University about using machine learning to predict song popularity.