Did Dream Make or Break the Minecraft Speedrunning Scene?

A data analysis by Johnny Rajala

Introduction

Many people play video games, but there is a growing community of people who enjoy an extra challenge: finishing the game as fast as possible. For practically every game, there is a speedrunning scene. In the early days of speedrunning, the practice was incredibly niche and splintered across various websites, but Speedrun.com has since become the biggest congregation of speedrunning content on the internet. As a member of the Minecraft: Java Edition speedrunning scene, I noticed an interesting phenomenon: the number of speedruns submitted during quarantine exploded, to the point where moderation became backlogged. Many runners have attributed this to the spare time afforded by quarantine, but I posit that one particular runner, Dream, may have had an outsized effect.

Collecting the data

In [1]:
import requests
import json
import sys
import matplotlib.pyplot as plt
from matplotlib.dates import drange
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import math
import time
import statsmodels.api as sm
import matplotlib.dates as mdates
from patsy import dmatrices
import pickle
%matplotlib inline

Thankfully, Speedrun.com provides a REST API for us to pull information about games, leaderboards, and even individual runs. The API requires a User-Agent header that briefly describes the use, which we define here:

In [2]:
init_headers = {'User-Agent': 'uni-project-bot/1.0'}

Now we can pull every game that is on Speedrun.com. We request each game's ID, which uniquely identifies it and lets us request more detailed information later.

In [3]:
# If data is already in directory, load that
if os.path.isfile("game_IDs.txt"): # if file exists we have already pickled a list
    with open("game_IDs.txt", 'rb') as f:
        game_IDs = pickle.load(f)
else:
    # Request every game in batches of 1000
    game_IDs = []
    offset = 0
    while True:
        # Request and unpack 1000 games
        URL = 'https://www.speedrun.com/api/v1/games?_bulk=yes&max=1000&offset=' + str(offset)
        response = requests.get(URL, headers=init_headers)
        data = response.json()['data']
        # Add each game's ID to the list
        for game in data:
            game_IDs.append(game['id'])
        offset += 1000
        # If this batch is smaller than the max, we have reached the end
        if len(data) < 1000:
            break
print(len(game_IDs))
28755

We see that in all, there are over 28 thousand games on Speedrun.com! With the IDs, we can now pull and store all the data we need. Here is the dataframe we will use; we collect data in the following fields:

  • Game: This is the international name of the game

  • Category: The category of the speedrun, which defines the leaderboard. A game can have multiple types of speedruns, such as beating the full game vs. a single level, and the category differentiates these types. It can be split into further subcategories with the Values field.

  • Run Time: The length of the speedrun, defined as whatever time is used for rankings on the leaderboard, in seconds

  • Date: The date the speedrun was submitted (not always present), in Year-Month-Day format

  • Values: Set of aspects of a run that can put it in a subcategory, such as using glitches vs. glitchless. Only present if it creates a subcategory. The values combined with the category designate what 'type' of run a person is submitting

  • Game ID, Cat ID: Unique identifiers the API uses for finding games and categories

Note that the API returns runs that are current to the date of data collection, so future rounds of data collection may look different, as they will include runs that did not exist at the time I collected.

In [4]:
df = pd.DataFrame(columns = ['Game','Category','Run Time','Date','Values', 'Game ID', 'Cat ID'])

Finally, we use our game IDs to collect the data! For each game, we collect every verified run that was ever submitted to a leaderboard; rejected runs are excluded. This means we are essentially collecting the entire speedrun history of each game.

In [5]:
#game_IDs = ['j1npme6p']
path = os.getcwd()
path += '/FinalData(2)'
# If data is already in directory, load that
if os.path.isfile(path):
    df = pd.read_csv(path)
    df.drop('Unnamed: 0', axis=1, inplace=True)
# Else, collect the data
else:
    maxim = 200 # Max number of runs we pull at a time, 200 is the maximum allowed
    sec = 15 # Cooldown time in seconds for when the API throttles us
    track = 0
    # Extracts all runs for each game, stores them in df
    for game_ID in tqdm(game_IDs):
        # Every 500 games, save the dataframe to disk as a checkpoint
        track += 1
        if track % 500 == 0:
            cwd = os.getcwd()
            path = cwd + "/DataV" + str(track // 500)
            df.to_csv(path)
        same = ''
        # Get info about game, categories, and variables
        URL = 'https://www.speedrun.com/api/v1/games/' + str(game_ID) + '?embed=categories.variables'
        response = requests.get(URL, headers=init_headers)
        data = response.json() # This has failed exactly once for reasons unknown

        try:
            data = data['data']
        except KeyError:
            # Occurs if we get a throttling error. We wait 15 seconds then try again.
            if 'status' in data and data['status'] == 420:
                while 'status' in data and data['status'] == 420:
                    time.sleep(sec)
                    response = requests.get(URL, headers=init_headers)
                    data = response.json()
                data = data['data']
            else:
                # If other error, print and move on.
                # The only error in my case was a game not being found, presumably deleted
                # between pulling the game list and pulling its runs.
                print('Error fetching game', game_ID, ':', data)
                continue
        game = data['names']['international']
        cats = data['categories']['data']

        # Finds all the runs for each category
        for categ in cats:
            cat = categ['id'] # Category ID
            cat_name = categ['name'] # Category Name
            offset = 0
            dir = 'asc'
            fin = ''
            sub_categories = [] # Collection of the variables that define subcategories
            all_vars = categ['variables']['data'] # Collects all variables of a run

            for var in all_vars:
                if var['is-subcategory']:
                    sub_categories.append(var['values']['values'])
            sub_keys = {}
            for s in sub_categories:
                # Assumed no two sub-categories in the same category will have the same variable ID
                temp_dict = dict(s)
                for t in temp_dict.keys():
                    temp_dict[t] = temp_dict[t]['label']
                sub_keys.update(temp_dict)


            # Collect data on every run. 
            while(True):
                # Asks API for verified runs from this category, ordered by date submitted
                URL = 'https://www.speedrun.com/api/v1/runs?game=' + str(game_ID) + '&category=' + str(cat) + '&orderby=submitted&direction=' + str(dir) + '&status=verified&max=' + str(maxim) + '&offset=' + str(offset)
                response = requests.get(URL,init_headers)
                data2 = response.json()
                try:
                    data2 = data2['data']
                except:
                    # Throttling error. Wait 15 seconds and try again.
                    if 'status' in data2 and data2['status'] == 420:
                        while 'status' in data2 and data2['status'] == 420:
                            time.sleep(sec)
                            response = requests.get(URL,init_headers)
                            data2 = response.json()
                        data2 = data2['data']
                    elif 'times' in data2:
                        data2 = data2
                    else:
                        # If other error, print and move on
                        print(2)
                        print(data2)
                        continue


                for run in data2:
                    # Add game, category, time, date, and options
                    sub_cat = set()
                    # We store the label of the subcategory for ease of reading
                    for var in run['values'].values():
                        if var in sub_keys:
                            sub_cat.add(sub_keys[var])
                    df.loc[len(df.index)] = [game, cat_name, run['times']['primary_t'], run['date'], sub_cat, game_ID, cat]


                # If length of collected data is smaller than maximum we can collect, we're at the end of the list and break
                if len(data2) < maxim:
                    break

                # Need to work from the back of the list if the offset is more than 10k (known bug)
                if offset + maxim >= 10000:
                    fin = data2[-1]
                    dir = 'desc'
                    offset = 0
                    continue

                # If we're working backwards and find the run we ended on going forward, we've found all runs and break
                if dir == 'desc' and fin in data2:
                    dir = 'asc'
                    fin = ''
                    break

                # If we collect 0 runs we break immediately (happens when a category has no runs)
                if len(data2) == 0:
                    break

                offset += maxim
# Convert the dates from a string to a datetime object, which is easier to use
def time_convert(x):
    if pd.isna(x):
        return np.nan
    try:
        return datetime.strptime(x, '%Y-%m-%d')
    except ValueError:
        # If a date doesn't match the expected format, report it and treat it as missing
        print(type(x), x)
        return np.nan
df['Date'] = [time_convert(x) for x in df['Date']]
df
Out[5]:
Game Category Run Time Date Values Game ID Cat ID
0 Bibi & Tina: New Adventures With Horses Main Missions 3531.0 2022-04-21 set() ldej22j1 wdmm094d
1 Bibi & Tina: New Adventures With Horses Main Missions 3482.0 2022-04-22 set() ldej22j1 wdmm094d
2 Bibi & Tina: New Adventures With Horses Main Missions 3396.0 2022-04-23 set() ldej22j1 wdmm094d
3 Bibi & Tina: New Adventures With Horses Main Missions 3346.0 2022-04-26 set() ldej22j1 wdmm094d
4 Burger & Frights Any% 906.0 2021-09-01 set() 3698y4ld zdnzx59d
... ... ... ... ... ... ... ...
2580131 暖雪 Warm Snow White Ash% NMG 1045.0 2022-04-19 set() v1pxz946 ndxnwvvk
2580132 暖雪 Warm Snow Fresh File% NMG 2569.0 2022-02-10 set() v1pxz946 vdoy5my2
2580133 暖雪 Warm Snow Fresh File% NMG 2351.0 2022-04-21 set() v1pxz946 vdoy5my2
2580134 暖雪 Warm Snow Fresh File% NMG 1676.0 2022-04-21 set() v1pxz946 vdoy5my2
2580135 鬼神童子ZENKI Any% 1390.0 2021-08-10 set() 9d387701 5dw180ek

2580136 rows × 7 columns

80 hours and over 2.5 million runs later, we have finally finished data collection.

Exploratory Data Analysis

To examine how the number of submitted runs changes with time, I decided to group them by month, using the average length of a month (about 4.345 weeks) to split the data.

In [6]:
# Split range of dates in to approximately 1 month bins
bins = int(round((max(df['Date'])-min(df['Date']))/timedelta(weeks = 4.345),0))
In [7]:
print(bins)
605

We see we have about 605 months' worth of runs (roughly 50 years, a hint that some submission dates are suspect). Now we can split the data between these months.

In [8]:
# Cut data into the bins based on submission date
df['Date_Cut'] = pd.cut(df.Date, bins = bins)
# We don't need to know full interval for graphing, take left endpoints
def relabel(x):
    if pd.isna(x):
        return np.nan
    else:
        return x.left

df['Date_Cut'] = [relabel(x) for x in df['Date_Cut']]

Finally, we plot the runs as a bar chart, with a bar for each month.

In [9]:
# Count how many runs fall in each of the cuts
counts = df['Date_Cut'].value_counts()
counts = dict(counts)
# Plot these counts
plt.bar(*zip(*counts.items()), width = 31)
plt.title('Runs Submitted to Speedrun.com')
plt.xlabel('Date Submitted')
plt.ylabel('Number of runs submitted')
plt.show()

This graph reveals several things, including both really old speedruns that were imported to Speedrun.com and runs with plain errors in their submission date (such as the run supposedly submitted at the start of Unix time). To rein in the scope of this project, we will limit the data to runs submitted in 2014 or later, when Speedrun.com went online.

In [10]:
# Subset of more recent data
rec = df[df['Date'] >= '2014-01-01']

# Count how many runs fall in each of the cuts
tot_counts = rec['Date_Cut'].value_counts()
tot_counts = dict(tot_counts)
# Plot these counts
plt.bar(*zip(*tot_counts.items()), width = 31)
plt.title('Runs Submitted to Speedrun.com')
plt.xlabel('Date Submitted')
plt.ylabel('Number of runs submitted')
plt.show()

We see a rather large spike in 2020 that increases so rapidly the growth could be exponential. We can test this theory with a graph with a logarithmic y-axis:

In [11]:
plt.bar(*zip(*tot_counts.items()), width = 31)
plt.title('Runs Submitted to Speedrun.com')
plt.xlabel('Date Submitted')
plt.yscale('log')
plt.ylabel('Number of runs submitted (log scale)')
plt.show()

Look at that! On the log scale we see a nearly linear relationship, suggesting the number of runs submitted to Speedrun.com grows approximately exponentially. At this scale, we can see the explosion of the speedrunning scene as a whole: gradual growth from 2014, slowing around 2018, before a huge and sustained spike in submitted speedruns in 2020. Let's do the same thing with Minecraft and see if it matches.

In [12]:
# All runs with game ID associated with Minecraft: JE
mine = rec[rec['Game ID'] == 'j1npme6p']

# Count how many runs fall in each of the cuts
counts = mine['Date_Cut'].value_counts()
counts = dict(counts)
# Plot these counts
plt.bar(*zip(*counts.items()), width = 31)
plt.title('Runs Submitted to Minecraft Leaderboards')
plt.xlabel('Date Submitted')
plt.ylabel('Number of runs submitted')
plt.show()
plt.bar(*zip(*counts.items()), width = 31)
plt.title('Runs Submitted to Minecraft Leaderboards')
plt.yscale('log')
plt.xlabel('Date Submitted')
plt.ylabel('Number of runs submitted (log scale)')
plt.show()

Here we can see why people believed Minecraft speedrunning really took off after quarantine began. We see a modest number of runs continuously submitted up until 2019, then gradual growth through 2020, then an explosion going into 2021. However, the log graph shows the key differences between Minecraft and Speedrun.com as a whole. While we could draw a general linear trend from 2014 to 2020, it appears to be quite weak. More importantly, we see a huge spike starting in 2020 and going into 2021, even on the log graph, suggesting Minecraft's surge in popularity far exceeded its earlier trend. Further, we see a sharp decline starting in 2021 and leading into 2022. These last two features differ drastically from the site-wide results. This suggests that Minecraft's speedrunning popularity behaves differently from the site as a whole, and we can show this quantitatively with linear regressions.

Linear Regressions

We can use a linear regression to make an exponential fit of the Speedrun.com data by taking the log of the number of runs per month, then fitting a linear regression with respect to time. To start, let's copy the data we want: the months and the log of the number of runs in those months.
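
Concretely, fitting a line to the log-counts is equivalent to fitting an exponential curve to the counts themselves:

$$\log_{10}(\text{count}) = \beta_0 + \beta_1 \cdot \text{Month} \quad\Longleftrightarrow\quad \text{count} = 10^{\beta_0} \cdot 10^{\beta_1 \cdot \text{Month}}$$

so the slope $\beta_1$ corresponds to a constant multiplicative growth rate per unit of time.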

In [13]:
tot_freq = pd.DataFrame.from_dict([dict(tot_counts)]).melt()
tot_freq.rename(columns = {'variable': 'Month', 'value': 'Count'}, inplace = True)
# Take the log of the monthly counts
tot_freq["Count"] = tot_freq['Count'].apply(lambda x: math.log10(x))
# Change how time is represented, as datetime objects don't fit well with statsmodels
copy = tot_freq['Month'].copy()
tot_freq['Month'] = mdates.date2num(tot_freq['Month'])
tot_freq.head()
Out[13]:
Month Count
0 18631.705785 4.951022
1 18783.672727 4.945469
2 18662.099174 4.929122
3 18692.492562 4.925451
4 18722.885950 4.897220

Note that the months are converted to numbers, which is necessary for our linear regression. Speaking of which, we will now fit a linear regression of the log-counts against the months.

In [14]:
X = tot_freq['Month']
X = sm.add_constant(X)
mod = sm.OLS(tot_freq['Count'], X)
res = mod.fit()
res.params
Out[14]:
const   -5.562301
Month    0.000554
dtype: float64

These parameters create a linear fit that looks like the following:

In [15]:
tot_freq['res'] = res.resid
tot_freq['fit'] = res.fittedvalues
tot_freq['Month'] = copy
tot_freq = tot_freq.sort_values(by = 'Month')
copy = tot_freq['Month'].copy()
fig,ax = plt.subplots()
ax.bar(tot_freq['Month'], tot_freq['Count'], width = 31)
plt.title('Runs Submitted to Speedrun.com')
plt.xlabel('Date Submitted')
plt.ylabel('Number of runs submitted (log scale)')
ax.set_ylim(2.5,5)
#plt.yscale('log')
ax2 = plt.twinx()
ax2.set_ylim(ax.get_ylim())
ax2.plot(tot_freq['Month'], tot_freq['fit'], color='k', label='Regression')
plt.show()

That seems to match expectations pretty closely! We can also plot the fit on the original data by exponentiating the model's predictions (taking 10 to the power of each fitted value).

In [16]:
fig,ax = plt.subplots()
#ax.bar(tot_freq['Month'], tot_freq['Count'], width = 31)
tot_freq['exp_fit'] = tot_freq['fit'].apply(lambda x: 10**x)
plt.bar(*zip(*tot_counts.items()), width = 31)
plt.title('Runs Submitted to Speedrun.com')
plt.xlabel('Date Submitted')
plt.ylabel('Number of runs submitted')
ax2 = plt.twinx()
ax2.set_ylim(ax.get_ylim())
ax2.plot(tot_freq['Month'], tot_freq['exp_fit'], color='k', label='Regression')
plt.show()

And finally, let's plot the residuals this fit produces:

In [17]:
fig,ax = plt.subplots()
ax.bar(copy, tot_freq['res'], width = 31)
plt.title('Residuals of Linear Model')
plt.xlabel('Date Submitted')
plt.ylabel('Residual of log-count')
plt.show()

These residuals show a pattern of dips and peaks in submissions that our model doesn't capture, which likely stems from information beyond the date alone (such as game genre, number of players, etc.). Despite this, the p-values don't lie:

In [18]:
res.summary2().tables[1]['P>|t|']
Out[18]:
const    7.883564e-34
Month    2.025207e-54
Name: P>|t|, dtype: float64

These p-values are minuscule, giving very strong evidence that this correlation between time and the number of submitted runs does exist.

Now, ideally we would apply this model to Minecraft and see how the residuals looked, but this would be misleading because Minecraft has significantly fewer runs than the site as a whole. So instead we will standardize both the Speedrun.com and Minecraft counts, then find our linear fit and apply it to the standardized Minecraft data.
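
Standardizing means converting each monthly log-count to a z-score:

$$z_i = \frac{\log_{10}(\text{count}_i) - \bar{x}}{s}$$

where $\bar{x}$ and $s$ are the mean and standard deviation of that series' log-counts. This puts both datasets on a common, unitless scale, so their sums of squared residuals can be compared fairly.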

In [19]:
# Find standardized count scores
avg_totcount = tot_freq['Count'].mean()
std_totcount = tot_freq['Count'].std()
tot_freq['Standard Count'] = (tot_freq['Count'] - avg_totcount)/std_totcount
tot_freq.head()
Out[19]:
Month Count res fit exp_fit Standard Count
100 2013-12-09 06:25:35.206611456 2.721811 -0.603005 3.324815 2112.590560 -2.799662
98 2014-01-08 15:52:03.966942208 3.062206 -0.279441 3.341646 2196.071160 -2.139951
99 2014-02-08 01:18:32.727272704 3.054613 -0.303864 3.358477 2282.850559 -2.154666
97 2014-03-10 10:45:01.487603200 3.088136 -0.287172 3.375309 2373.059111 -2.089696
96 2014-04-09 20:11:30.247933952 3.130012 -0.262128 3.392140 2466.832322 -2.008537
In [20]:
tot_freq['Month']=mdates.date2num(tot_freq['Month'])
X = tot_freq['Month']
X = sm.add_constant(X)
mod = sm.OLS(tot_freq['Standard Count'], X)
res = mod.fit()
res.params
Out[20]:
const   -18.854888
Month     0.001073
dtype: float64
In [21]:
tot_freq['res'] = res.resid
tot_freq['fit'] = res.fittedvalues
tot_freq['Month'] = copy
tot_freq = tot_freq.sort_values(by = 'Month')
fig,ax = plt.subplots()
ax.bar(tot_freq['Month'], tot_freq['Standard Count'], width = 31)
plt.title('Standardized Runs Submitted to Speedrun.com')
plt.xlabel('Date Submitted')
plt.ylabel('Standardized Runs')
#plt.yscale('log')
ax2 = plt.twinx()
ax2.set_ylim(ax.get_ylim())
ax2.plot(tot_freq['Month'], tot_freq['fit'], color='k', label='Regression')
plt.show()

We also note the sum of squared residuals, for later comparison.

In [22]:
r_sq = res.resid.apply(lambda x: x**2)
sum(r_sq)
Out[22]:
8.650766598067708

And now we can standardize Minecraft's counts and see if they fit this model.

In [23]:
freq = pd.DataFrame.from_dict([dict(counts)]).melt()
freq.rename(columns = {'variable': 'Month', 'value': 'Count'}, inplace = True)
# Take the log of the monthly counts
freq["Count"] = freq['Count'].apply(lambda x: math.log10(x))
# Change how time is represented, as datetime objects don't fit well with statsmodels
copy = freq['Month'].copy()
freq['Month'] = mdates.date2num(freq['Month'])

# Find standard scores
avg = freq['Count'].mean()
std = freq['Count'].std()
freq['Standard Count'] = (freq['Count'] - avg)/std
# Apply the site-wide standardized fit to Minecraft's months
freq['fit'] = freq['Month'] * res.params['Month'] + res.params['const']
freq.head()
Out[23]:
Month Count Standard Count fit
0 18662.099174 3.075182 1.922728 1.174316
1 18692.492562 3.068928 1.915992 1.206935
2 18631.705785 3.025715 1.869447 1.141696
3 18722.885950 3.022841 1.866351 1.239555
4 18753.279339 2.943495 1.780886 1.272175
In [24]:
freq['Month'] = copy
freq['square_res'] = (freq['fit'] - freq['Standard Count'])**2
freq = freq.sort_values(by = 'Month')
copy = freq['Month'].copy()
fig,ax = plt.subplots()
ax.bar(freq['Month'], freq['Standard Count'], width = 31)
plt.title('Standardized Minecraft runs vs. Site fit')
plt.xlabel('Date Submitted')
plt.ylabel('Standardized Runs')
#plt.yscale('log')
ax2 = plt.twinx()
ax2.set_ylim(ax.get_ylim())
ax2.plot(freq['Month'], freq['fit'], color='k', label='Regression')
plt.show()

This seems to somewhat fit the leaderboards, but the shape of Minecraft's runs doesn't match nearly as well as it did for the general Speedrun.com data. Also note the sum of squared residuals:

In [25]:
r_sq = freq['square_res'].sum()
r_sq
Out[25]:
36.09247732952555

It is much higher (about 36.1 vs. 8.65), which is an even bigger deal when we remember we are working on a log scale.
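
To put these residuals in perspective: the counts are on a log scale, so a residual of $r$ (after scaling back by the series' standard deviation) means the model misses the actual count by a multiplicative factor of $10^{r}$; for example, $r = 0.5$ is a miss by a factor of $10^{0.5} \approx 3.2$. Let's make a fit of Minecraft with its own data to see if it does a better job.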

In [26]:
freq['Month']=mdates.date2num(freq['Month'])
X = freq['Month']
X = sm.add_constant(X)
mod = sm.OLS(freq['Count'], X)
res = mod.fit()
res.params
Out[26]:
const   -13.530621
Month     0.000842
dtype: float64
In [27]:
freq['res'] = res.resid
freq['fit'] = res.fittedvalues
freq['Month'] = copy
freq = freq.sort_values(by = 'Month')
fig,ax = plt.subplots()
plt.title('Runs Submitted to Minecraft Leaderboards')
plt.xlabel('Date Submitted')
plt.ylabel('Number of runs (log scale)')
ax.bar(freq['Month'], freq['Count'], width = 31)
ax2 = plt.twinx()
ax2.set_ylim(ax.get_ylim())
ax2.plot(freq['Month'], freq['fit'], color='k', label='Regression')
plt.show()

We see it roughly follows the data, but with a seemingly much looser fit. Let's examine the residuals:

In [28]:
fig,ax = plt.subplots()
ax.bar(copy, freq['res'], width = 31)
plt.title('Residuals of Minecraft Linear Model')
plt.xlabel('Date Submitted')
plt.ylabel('Residual of log-count')
plt.show()

Compared to before, this model has much larger residuals for Minecraft, with the same patterned structure that strains the assumptions of a linear model. However, a looser model can still be significant; let's look at the p-values:

In [29]:
res.summary2().tables[1]['P>|t|']
Out[29]:
const    8.032060e-21
Month    3.208438e-23
Name: P>|t|, dtype: float64

These are minuscule, showing there is certainly a relationship between the number of speedruns and time. However, these values are quite different from those of the overall speedrunning community, so there must be something about Minecraft affecting its results. While quarantine obviously played a part in Minecraft's surge, it would have played a similar role in the overall speedrunning scene, yet we see a difference. So, I suggest another variable which boosted Minecraft only until its peak: Dream. To summarize, Dream is a very popular Minecraft YouTuber who, from 2019 through 2020, was Minecraft's most popular speedrunner. However, in December of 2020 it was found that Dream had cheated on his speedruns, leading to him publicly disavowing the community on Speedrun.com. Notice how close December 2020 is to the peak of Minecraft speedrunning, so perhaps this could be an explanatory variable.

Dream uploaded his first world record on March 16, 2020, and the evidence of his cheating was published on December 11, 2020. Thus we can consider the time between these two dates to be "peak Dream influence." We can add whether a month occurred between these two dates to our model.

In [30]:
freq['Month']=mdates.date2num(freq['Month'])
start = mdates.datestr2num('03/16/2020')
#mdates.date2num(freq['Month']) 'Mar 16, 2020'
end = mdates.datestr2num('12/11/2020')
freq['Dream'] = freq['Month'].between(start,end)
# 1 if month occurs in 'peak Dream,' 0 otherwise
freq['Dream'] = freq['Dream'].apply(lambda x: 1 if x else 0)
freq.head()
Out[30]:
Month Count Standard Count fit square_res res Dream
94 16078.661157 0.000000 -1.389596 0.006438 0.043588 -0.006438 0
76 16109.054545 0.477121 -0.875682 0.032027 0.476200 0.445095 0
90 16139.447934 0.301030 -1.065352 0.057616 0.218820 0.243414 0
96 16169.841322 0.000000 -1.389596 0.083205 0.012303 -0.083205 0
80 16200.234711 0.477121 -0.875682 0.108794 0.350716 0.368327 0

We want to add an interaction term, as we are suggesting that Dream both brought an influx of new runners and continued to raise the community's popularity while active; thus the growth of speedrunning with respect to time would have changed as well, not just the baseline.
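
In other words, the model we are fitting is

$$\log_{10}(\text{count}) = \beta_0 + \beta_1 \cdot \text{Month} + \beta_2 \cdot \text{Dream} + \beta_3 \cdot (\text{Month} \times \text{Dream})$$

where Dream is the 0/1 indicator defined above: $\beta_2$ shifts the baseline during the "peak Dream" window, while the interaction coefficient $\beta_3$ captures the change in growth rate.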

In [31]:
y,X = dmatrices('Count ~ Month*Dream',freq, return_type = 'dataframe')
y = np.ravel(y)
X.head()
Out[31]:
Intercept Month Dream Month:Dream
94 1.0 16078.661157 0.0 0.0
76 1.0 16109.054545 0.0 0.0
90 1.0 16139.447934 0.0 0.0
96 1.0 16169.841322 0.0 0.0
80 1.0 16200.234711 0.0 0.0

Now we fit the model and plot.

In [32]:
mod = sm.OLS(y,X)
fit = mod.fit()
fit.params
Out[32]:
Intercept     -12.196165
Month           0.000762
Dream         -35.893774
Month:Dream     0.001983
dtype: float64

Note how the Month:Dream coefficient is much larger than the Month coefficient.

In [33]:
freq['res'] = fit.resid
freq['fit'] = fit.fittedvalues
freq['Month'] = copy
freq = freq.sort_values(by = 'Month')
fig,ax = plt.subplots()
ax.bar(freq['Month'], freq['Count'], width = 31)
plt.title('Dream Minecraft Model')
plt.ylabel('Number of Runs (log scale)')
plt.xlabel('Date')
ax2 = plt.twinx()
ax2.set_ylim(ax.get_ylim())
ax2.plot(freq['Month'], freq['fit'], color='k', label='Regression')
plt.show()

fig,ax = plt.subplots()
freq['exp_fit'] = freq['fit'].apply(lambda x: 10**x)
plt.bar(*zip(*counts.items()), width = 31)
plt.title('Dream Minecraft Model')
plt.xlabel('Date Submitted')
plt.ylabel('Number of runs')
ax2 = plt.twinx()
ax2.set_ylim(ax.get_ylim())
ax2.plot(freq['Month'], freq['exp_fit'], color='k', label='Regression')
plt.show()

We see on both scales that this model fits much more closely than the previous one. Let's examine the residuals and p-values:

In [34]:
r_sq = fit.resid.apply(lambda x: x**2)
sum(r_sq)
Out[34]:
24.85337402573328
In [35]:
fit.summary2().tables[1]['P>|t|']
Out[35]:
Intercept      1.069543e-18
Month          6.716696e-21
Dream          3.788544e-01
Month:Dream    3.690292e-01
Name: P>|t|, dtype: float64

This sum of squared residuals is significantly smaller, but these p-values suggest the Dream "sweet spot" isn't significant, despite how nice the graph looks. But what if we also added the time after Dream was caught cheating, when he had told his fans not to associate with the leaderboard anymore? Any month after December 11, 2020 falls in this period.
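
The extended model adds a second indicator and interaction term:

$$\log_{10}(\text{count}) = \beta_0 + \beta_1 \cdot \text{Month} + \beta_2 \cdot \text{Dream} + \beta_3 \cdot (\text{Month} \times \text{Dream}) + \beta_4 \cdot \text{Cheat} + \beta_5 \cdot (\text{Month} \times \text{Cheat})$$

where Cheat is 1 for any month after December 11, 2020 and 0 otherwise, so $\beta_5$ captures the change in growth rate after the cheating scandal.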

In [36]:
freq['Month']=mdates.date2num(freq['Month'])
freq['Cheat'] = freq['Month'].apply(lambda x: 1 if x > end else 0)
freq.head()
Out[36]:
Month Count Standard Count fit square_res res Dream exp_fit Cheat
94 16078.661157 0.000000 -1.389596 0.058446 0.043588 -0.058446 0 1.144051 0
76 16109.054545 0.477121 -0.875682 0.081610 0.476200 0.395511 0 1.206731 0
90 16139.447934 0.301030 -1.065352 0.104775 0.218820 0.196255 0 1.272844 0
96 16169.841322 0.000000 -1.389596 0.127940 0.012303 -0.127940 0 1.342579 0
80 16200.234711 0.477121 -0.875682 0.151105 0.350716 0.326016 0 1.416135 0
In [37]:
y,X = dmatrices('Count ~ Month*Dream + Month*Cheat',freq, return_type = 'dataframe')
y = np.ravel(y)
mod = sm.OLS(y,X)
fit = mod.fit()
fit.params
Out[37]:
Intercept      -6.464687
Month           0.000423
Dream         -41.625253
Month:Dream     0.002322
Cheat          61.912920
Month:Cheat    -0.003224
dtype: float64
In [38]:
freq['res'] = fit.resid
freq['fit'] = fit.fittedvalues
freq['Month'] = copy
freq = freq.sort_values(by = 'Month')
fig,ax = plt.subplots()
ax.bar(freq['Month'], freq['Count'], width = 31)
plt.title('Dream popularity AND cheating Minecraft model')
plt.ylabel('Number of runs (log scale)')
plt.xlabel('Date')
ax2 = plt.twinx()
ax2.set_ylim(ax.get_ylim())
ax2.plot(freq['Month'], freq['fit'], color='k', label='Regression')
plt.show()

fig,ax = plt.subplots()
#ax.bar(tot_freq['Month'], tot_freq['Count'], width = 31)
freq['exp_fit'] = freq['fit'].apply(lambda x: 10**x)
plt.bar(*zip(*counts.items()), width = 31)
plt.title('Dream popularity AND cheating Minecraft model')
plt.xlabel('Date Submitted')
plt.ylabel('Number of runs submitted')
ax2 = plt.twinx()
ax2.set_ylim(ax.get_ylim())
ax2.plot(freq['Month'], freq['exp_fit'], color='k', label='Regression')
plt.show()

We see a much tighter fit around the "hump." What about the p-values for this fit?

In [39]:
fit.summary2().tables[1]['P>|t|']
Out[39]:
Intercept      2.191631e-07
Month          9.302297e-09
Dream          1.660017e-01
Month:Dream    1.535356e-01
Cheat          5.530645e-06
Month:Cheat    7.928875e-06
Name: P>|t|, dtype: float64

Well, the Dream "sweet spot" still isn't significant, but the Dream "cheatspot" certainly seems to be! This suggests the period after Dream was caught cheating is correlated with a decrease in speedrun submissions over the following year.
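
We can read this decline directly off the fitted coefficients. Since Month is measured in days (matplotlib date numbers), the implied slope of the log-count after the scandal is

$$\beta_1 + \beta_5 \approx 0.000423 - 0.003224 = -0.002801$$

per day, which is negative (submissions shrinking), versus $\beta_1 + \beta_3 \approx 0.000423 + 0.002322 = 0.002745$ per day during the "peak Dream" window. Finally, let's plot the residuals.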

In [41]:
freq['res'] = fit.resid
fig,ax = plt.subplots()
ax.bar(copy, freq['res'], width = 30)
plt.title('New Model Residuals')
plt.ylabel('Residual')
plt.show()

We see the residuals are much more closely centered around zero, with more uniform variance and less structure with respect to time, which is what we want from a linear model.

Discussion and Future Research

It appears from these regressions that Dream may have had an impact on Minecraft speedrunning, but not in the way I was expecting. There may be room to argue that Dream brought more people into speedrunning with his world record and fame, but there is a much stronger argument that his cheating and subsequent departure from the community was at least correlated with a shrinkage of the community overall. This would explain why Minecraft seemingly nosedived in 2021 where Speedrun.com did not. However, there are other reasons Minecraft's speedruns may have been irregular that future research in this area could address. These include the demographics of Minecraft players, who skew young: quarantine keeping children in and out of school would make their presence (or absence) more pronounced. Other aspects like game genre could also have an effect. It would also be useful to rerun this analysis with cutoff dates other than those relating to Dream, to ensure the significance found here isn't a matter of overfitting.