Design a site like this with WordPress.com
Get started

Board Game Regressor

Before we dive into machine learning, we’re going to explore a dataset and figure out what might be interesting to predict. The dataset is from BoardGameGeek and contains data on about 80,000 board games. Here’s a single board game on the site. This information was kindly scraped into CSV format by Sean Beck, and can be downloaded here. The dataset contains several data points about each board game.

The first step in our exploration is to read in the data and print some quick summary statistics. We also enhanced the tutorial with a tree-based feature-importance plot.

One of the nice things about Scikit-learn is that it enables us to try more powerful algorithms very easily. One such algorithm is called random forest. The random forest algorithm can find nonlinearities in data that a linear regression wouldn’t be able to pick up on.

import re, math
from collections import Counter
import numpy as np
import pandas
import matplotlib.pyplot as plt

# Two sample questions compared by the cosine and Jaccard similarity demo.
text1 = 'How can I be a geologist?'
text2 = 'What should I do to be a geologist?'

class Similarity():
    """Bag-of-words text similarity helpers (cosine and Jaccard).

    The vectorising methods turn a text into a ``Counter`` of word
    frequencies; the tokenising methods turn it into a list of words.
    Cosine similarity works on the frequency mappings, Jaccard similarity
    on the token sequences.
    """

    # Compiled once for the class instead of on every text_to_vector call.
    _WORD = re.compile(r'\w+')

    def compute_cosine_similarity(self, string1, string2):
        """Return the cosine similarity of two word-frequency mappings.

        Args:
            string1: mapping of word -> count (e.g. a ``Counter``).
            string2: mapping of word -> count (e.g. a ``Counter``).

        Returns:
            The similarity rounded to 4 decimals, or 0.0 when either
            vector has zero length (e.g. an empty mapping).
        """
        # Kept from the original: echoes both vectors to stdout.
        print(string1, string2)
        # Only words present in both vectors contribute to the dot product.
        shared = set(string1) & set(string2)
        numerator = sum(string1[word] * string2[word] for word in shared)

        # Squared Euclidean norm of each vector.
        sum1 = sum(count ** 2 for count in string1.values())
        sum2 = sum(count ** 2 for count in string2.values())

        denominator = math.sqrt(sum1) * math.sqrt(sum2)
        if not denominator:
            return 0.0
        return round(numerator / float(denominator), 4)

    def text_to_vector(self, text):
        """Return a ``Counter`` of the ``\\w+`` words in *text* (case kept).

        The extracted word list is also printed to stdout.
        """
        words = self._WORD.findall(text)
        print(words)
        return Counter(words)

    def text_to_vector2(self, atext):
        """Return a ``Counter`` of the lower-cased, space-separated tokens.

        Unlike ``text_to_vector`` this splits on single spaces only, so
        punctuation stays attached to its word. The token list is also
        printed to stdout.
        """
        atex = atext.lower().split(" ")
        print(atex)
        return Counter(atex)

    # Jaccard Similarity
    def tokenize(self, string):
        """Return the lower-cased tokens of *string*, split on single spaces."""
        return string.lower().split(" ")

    # The original tokenize2 was a verbatim copy of tokenize; keep the name
    # available for existing callers without duplicating the body.
    tokenize2 = tokenize

    def jaccard_similarity(self, string1, string2):
        """Return |A ∩ B| / |A ∪ B| for the token sets of both inputs.

        Args:
            string1: iterable of tokens.
            string2: iterable of tokens.

        Returns:
            The Jaccard index as a float. Two empty inputs yield 0.0
            (mirroring the zero-denominator convention of
            ``compute_cosine_similarity``) instead of dividing by zero.
        """
        set1, set2 = set(string1), set(string2)
        union = set1 | set2
        if not union:
            return 0.0
        return len(set1 & set2) / float(len(union))

# Run both similarity measures on the two sample questions.
similarity = Similarity()

# Word-frequency vectors (input for the cosine measure); text1 is processed
# first so the printed token lists keep their order.
vector1, vector2 = (similarity.text_to_vector2(t) for t in (text1, text2))

# Plain token lists (input for the Jaccard measure).
token1, token2 = map(similarity.tokenize2, (text1, text2))

cosine = similarity.compute_cosine_similarity(vector1, vector2)
print('Cosine Similarity:', cosine)

jaccard = similarity.jaccard_similarity(token1, token2)
print('Jaccard Similarity:', jaccard)

#https://www.dataquest.io/blog/machine-learning-python/

# Load the BoardGameGeek dump. NOTE: the path is specific to the author's
# machine and must be adapted before running elsewhere.
games = pandas.read_csv(r"C:\maXbox\mX46210\DataScience\games.csv")
# Print the names of the columns in games.
print(games.columns)
print(games.shape)

#plt.hist(games["average_rating"])
#plt.show()

# Show one example of a game whose average rating is exactly 0. The lookup is
# guarded so a dataset without such rows does not raise an IndexError.
zero_rated = games[games["average_rating"] == 0]
if not zero_rated.empty:
    print(zero_rated.iloc[0])

# Keep only games that at least one user has rated.
games = games[games["users_rated"] > 0]
# Remove any rows with missing values.
games = games.dropna(axis=0)
print(games.shape)

from sklearn.cluster import KMeans

# n_init is pinned explicitly: the scikit-learn default changed between
# releases, so relying on it makes the clustering (and the plot) depend on
# the installed version.
kmeans_model = KMeans(n_clusters=5, n_init=10, random_state=1)
# Cluster on the numeric columns only (public replacement for the private
# DataFrame._get_numeric_data()).
good_columns = games.select_dtypes(include="number")
kmeans_model.fit(good_columns)
labels = kmeans_model.labels_
print(labels)

# cluster_centers_ is already an ndarray of shape (n_clusters, n_features).
centers = kmeans_model.cluster_centers_

plt.title('maXbox k-means centroids')
colors = ['b', 'g', 'c', 'r', 'y']

# Scatter the first two numeric columns, one colour per cluster.
X = good_columns.values
u_labels = np.unique(labels)
for i in u_labels:
    members = labels == i
    # Wrap around the palette so a larger n_clusters cannot raise IndexError.
    plt.scatter(X[members, 0], X[members, 1],
                color=colors[i % len(colors)], label=i)

# Mark the centroids (same two coordinates) with large black crosses.
plt.scatter(centers[:, 0], centers[:, 1], marker="x", s=250, color='k')
plt.legend()
plt.show()

from sklearn.decomposition import PCA

# Project the numeric columns onto principal components and colour each game
# by its k-means cluster. Three components are computed, but only the first
# two are drawn.
pca_2 = PCA(3)  #2 or 3?
plt.title('maXbox PCA fit_transform')
plot_columns = pca_2.fit_transform(good_columns)
plt.scatter(x=plot_columns[:, 0], y=plot_columns[:, 1], c=labels)
plt.show()

# Correlation of every numeric column with the target. The numeric-only frame
# is used because DataFrame.corr() on a frame that still contains string
# columns ("type", "name") raises on recent pandas versions.
print(good_columns.corr()["average_rating"])

# The variable we'll be predicting.
target = "average_rating"

# Predictors are all dataframe columns except the target itself, its Bayesian
# variant, and the "type" / "name" columns.
_excluded = ("bayes_average_rating", "average_rating", "type", "name")
columns = [name for name in games.columns.tolist() if name not in _excluded]

# Import a convenience function to split the sets.
from sklearn.model_selection import train_test_split

# Draw 80% of the rows as the training set; the fixed random_state makes the
# draw reproducible.
train = games.sample(frac=0.8, random_state=1)
# Every row whose index label was not drawn goes into the testing set.
in_train = games.index.isin(train.index)
test = games.loc[~in_train]
# Report the shapes of both sets.
for caption, subset in (('train shape: ', train), ('test shape: ', test)):
    print(caption, subset.shape)

# Baseline model: ordinary least-squares linear regression.
from sklearn.linear_model import LinearRegression
# Error and goodness-of-fit metrics.
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Train on the training rows ...
model = LinearRegression()
model.fit(train[columns], train[target])
# ... and predict the held-out test rows.
predictions = model.predict(test[columns])

# The mean squared error is printed twice, once per argument order; both
# lines show the same value.
actual = test[target]
for mse in (mean_squared_error(predictions, actual),
            mean_squared_error(actual, predictions)):
    print(mse)

print(r2_score(actual, predictions))

# Import the random forest model.
from sklearn.ensemble import RandomForestRegressor

# Initialize the model with some parameters; random_state makes the forest
# reproducible.
model = RandomForestRegressor(n_estimators=100, min_samples_leaf=10, random_state=1)
# Fit the model to the training data.
model.fit(train[columns], train[target])
# Make predictions for the test set.
predictions = model.predict(test[columns])
# Compute the error (arguments in the documented y_true, y_pred order).
print('mse ', mean_squared_error(test[target], predictions))
print('r2 ', r2_score(test[target], predictions))

# Horizontal bar chart of the feature importances, sorted ascending so the
# most important feature ends up at the top. `columns` is a plain list, so it
# is converted to an array before indexing with the argsort result (indexing
# the list directly raises "only integer scalar arrays can be converted to a
# scalar index").
sorted_idx = model.feature_importances_.argsort()
plt.title('maXbox Game Ratings Feature Importance')
plt.xlabel("maXbox Random Forest Feature Importance")
plt.barh(np.array(columns)[sorted_idx], model.feature_importances_[sorted_idx])
plt.show()
 

#https://github.com/ThaWeatherman/scrapers/blob/master/boardgamegeek/spider.py
#https://mljar.com/blog/feature-importance-in-random-forest/
#----app_template_loaded_code----
#----File newtemplate.txt not exists - now saved!----

And the output:

C:\maXbox\mX46210\maxbox4>py ..\DataScience\gamegeek_similarity_py.txt
['how', 'can', 'i', 'be', 'a', 'geologist?']
['what', 'should', 'i', 'do', 'to', 'be', 'a', 'geologist?']
Counter({'how': 1, 'can': 1, 'i': 1, 'be': 1, 'a': 1, 'geologist?': 1}) Counter({'what': 1, 'should': 1, 'i': 1, 'do': 1, 'to': 1, 'be': 1, 'a': 1, 'geologist?': 1})
Cosine Similarity: 0.5774
Jaccard Similarity: 0.4
Index(['id', 'type', 'name', 'yearpublished', 'minplayers', 'maxplayers',
       'playingtime', 'minplaytime', 'maxplaytime', 'minage', 'users_rated',
       'average_rating', 'bayes_average_rating', 'total_owners',
       'total_traders', 'total_wanters', 'total_wishers', 'total_comments',
       'total_weights', 'average_weight'],
      dtype='object')
(81312, 20)
id 318
type boardgame
name Looney Leo
yearpublished 0
minplayers 0
maxplayers 0
playingtime 0
minplaytime 0
maxplaytime 0
minage 0
users_rated 0
average_rating 0
bayes_average_rating 0
total_owners 0
total_traders 0
total_wanters 0
total_wishers 1
total_comments 0
total_weights 0
average_weight 0
Name: 13048, dtype: object
(56894, 20)
[2 1 1 ... 4 4 4]
id 0.304201
yearpublished 0.108461
minplayers -0.032701
maxplayers -0.008335
playingtime 0.048994
minplaytime 0.043985
maxplaytime 0.048994
minage 0.210049
users_rated 0.112564
average_rating 1.000000
bayes_average_rating 0.231563
total_owners 0.137478
total_traders 0.119452
total_wanters 0.196566
total_wishers 0.171375
total_comments 0.123714
total_weights 0.109691
average_weight 0.351081
Name: average_rating, dtype: float64
train shape: (45515, 20)
test shape: (11379, 20)
1.8239281903519875
1.8239281903519875
0.268394771387396
mse 1.414465540054245
r2 0.4326364435453288

C:\maXbox\mX46210\maxbox4>

Below, we exploit the fact that every Pandas row has a unique index to select any row not in the training set to be in the testing set.

# Generate the training set: an 80% random sample of the rows. Setting
# random_state makes the sample reproducible.
train = games.sample(frac=0.8, random_state=1)
# Select every row whose index label is not in the training set and put it in
# the testing set. isin() yields a boolean mask; ~ negates it element-wise
# ("not in the training set").
test = games.loc[~games.index.isin(train.index)]
Fitting a target function with different-degree polynomials – Deep Learning for NLP and Speech Recognition, Springer 2018.

Given a line and a point not on the line, construct a line through the point and perpendicular to the line. The trick here is to determine the slope of the given line, m, and take advantage of the fact that the slope of a perpendicular line is -1/m.

HMI Metall Edition

Published by maxbox4

Code till the End

One thought on “Board Game Regressor”

  1. [maxbox:news] V4.7.5.20 Released discussion
    new units: 01 RotImg.pas + uModel : TModel lib dmath
    02 SimpleImageLoader.pas
    03 systemsdiagram.pas + fpc switch
    04 qsfoundation.pas NO FPC Vector operator
    05 prediction.pas SimulationEngine missing
    06 HSLUtils.pas – color model
    07 cInternetUtils.pas – header information
    08 cWindows.pas – cstrings routines as flcSysUtils
    09 flcSysUtils.pas //2 functions with cwindows possible+ freqObj +TBytes utils
    10 GraphicsMathLibrary.PAS gml-profix
    11 flcBits32.pas //$IFDEF DEBUG} {$IFDEF TEST} procedure Test;
    12 flcFloats.pas No Floats instead: uPSI_cBlaiseParserLexer.pas
    13 flcDecimal.pas + TestClass
    14 flcCharSet.pas //test include wrapper + –
    15 flcComplex.pas -Class
    16 flcMaths.pas //{$IFDEF MATHS_TEST} procedure Test
    17 flcMatrix.pas – less TVectors
    18 flcRational.pas -Class
    19 flcStatistics.pas -Class
    20 flcStringBuilder.pas – less Unicode
    21 flcVectors.pas No Vectors cause compatibility
    22 flcTimers.pas {$DEFINE TIMERS_TEST}

    Release Notes maXbox 4.7.5.20 Jan 2021 mX47

    Add 22 Units + 4 Tutorials

    1277 unit uPSI_SystemsDiagram.pas Dendron
    1278 unit uPSI_qsFoundation.pas Dendron
    1279 uPSI_JclStringLists2 JCL
    1280 uPSI_cInternetUtils2 FLC
    1281 uPSI_cWindows.pas FLC
    1282 uPSI_flcSysUtils.pas +TBytes utils
    1283 unit uPSI_RotImg.pas DA
    1284 uPSI_SimpleImageLoader.pas LAZ
    1285 uPSI_HSLUtils.pas LAZ
    1286 uPSI_GraphicsMathLibrary.pas EF
    1287 unit uPSI_umodels.pas DMath
    1288 uPSI_flcStatistics.pas FLC5
    1289 uPSI_flcMaths.pas FLC5
    1290 uPSI_flcCharSet.pas
    1291 uPSI_flcBits32.pas
    1292 uPSI_flcTimers.pas
    1293 uPSI_cBlaiseParserLexer.pas
    1294 uPSI_flcRational.pas
    1295 uPSI_flcComplex.pas
    1296 unit uPSI_flcMatrix (uPSI_flcVectors.pas)
    1297 unit uPSI_flcStringBuilder.pas
    1298 unit PJResFile_Routines;

    Like

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s

%d bloggers like this: