about summary refs log tree commit diff
diff options
context:
space:
mode:
author Jan Westerdiep <janner@gmail.com> 2016-05-29 15:36:54 +0200
committer Jan Westerdiep <janner@gmail.com> 2016-05-29 15:36:54 +0200
commit d3a3fd7205969e0f6e930e9e5dd2f3e004d4c2dd (patch)
tree d605c12251db168fc493a54c6989014be6187ea4
parent 66f8711c7ff6e05ade7e7ec39bee7e783bafb265 (diff)
adding stray files important to the full project code; minor and mostly cosmetic changes in existing files
-rw-r--r-- Assignment 2/exploration.py 47
-rw-r--r-- Assignment 2/spelen/feature_importances.py 21
-rw-r--r-- Assignment 2/spelen/ranklib2submission.py 12
-rw-r--r-- Assignment 2/spelen/run_lambdamart.py 16
-rw-r--r-- Assignment 2/to_svmlight.py 23
5 files changed, 88 insertions, 31 deletions
diff --git a/Assignment 2/exploration.py b/Assignment 2/exploration.py
index afb0f25..6c4b5c2 100644
--- a/Assignment 2/exploration.py
+++ b/Assignment 2/exploration.py
@@ -8,14 +8,17 @@ import matplotlib.pyplot as plt
import pandas as pd
from matplotlib import colors
-train = pd.read_csv("training_set_VU_DM_2014.csv", header=0, parse_dates=[1])
+nrows = None
+train = pd.read_csv("training_set_VU_DM_2014.csv", header=0, nrows=nrows, parse_dates=[1])
+train["prop_log_historical_price"] = train.prop_log_historical_price.replace(0.0, np.nan)
+train["prop_historical_price"] = (np.e ** train["prop_log_historical_price"])
nrows = train.shape[0]
# create a nice plot with x-axis the features, and y-axis the percentage of missing data
def barplot():
- fig, ax = plt.subplots(figsize=(1200/80, 500/80))
+ fig, ax = plt.subplots(figsize=(1200/120, 500/120))
d = train.isnull().sum().to_dict()
- items = sorted(d.items(), key=lambda kv: kv[1])
+ items = sorted(d.items(), key=lambda kv: (kv[1],kv[0]))
bp = sns.barplot(map(lambda kv: kv[0], items),
map(lambda kv: 100*kv[1]/nrows, items),
@@ -25,25 +28,33 @@ def barplot():
for item in bp.get_xticklabels():
item.set_rotation(90)
- plt.subplots_adjust(bottom=0.4)
+ plt.subplots_adjust(bottom=0.5)
plt.savefig("barplot", dpi=400)
def outliers():
- cols = ['visitor_hist_adr_usd', 'price_usd', 'gross_bookings_usd', 'orig_destination_distance',
- 'comp1_rate_percent_diff',
- 'comp2_rate_percent_diff',
- 'comp3_rate_percent_diff',
+ cols = [
+ 'prop_location_score2',
+ 'prop_location_score1',
+ 'comp1_rate_percent_diff',
'comp4_rate_percent_diff',
- 'comp5_rate_percent_diff',
- 'comp6_rate_percent_diff',
- 'comp7_rate_percent_diff',
- 'comp8_rate_percent_diff']
- fig, axarr = plt.subplots(2, sharex=True, figsize=(1200/80, 500/80))
+ 'srch_booking_window',
+ 'price_usd',
+ 'visitor_hist_adr_usd',
+ 'prop_historical_price',
+ 'gross_bookings_usd',
+ 'orig_destination_distance',
+ ]
+ fig, axarr = plt.subplots(figsize=(1200/120, 300/120))
#ax = train[cols].boxplot(rot=90, return_type="axes", sym='k.', showfliers=True)
- ax = sns.boxplot(train[cols], orient='h', whis=0.1, ax=axarr[0], fliersize=2)
- ax.set_xscale('log')
- ax = sns.boxplot(train[cols], orient='h', ax=axarr[1], fliersize=2)
+ """
+ train[cols].boxplot(vert=False, sym='k.')
+ plt.xscale('log')
+ """
+ vals = [x.dropna() for y,x in train[cols].iteritems()]
+ ax = sns.boxplot(train[cols], orient='h', fliersize=2)
ax.set_xscale('log')
+ plt.xlim(xmin=1e-4)
+ plt.subplots_adjust(left=0.3, bottom=0.1)
plt.savefig("outliers", dpi=400)
def dateplot():
@@ -55,13 +66,13 @@ def dateplot():
else:
return ''
- df2 = train[train.booking_bool == 1]
+ df2 = train
dates = [(a.year, a.month, a.day) for a in df2.date_time]
c = sorted(Counter(dates).items(), key=lambda kv: kv[0][0]*10000 + kv[0][1]*100 + kv[0][2])
datesort = [kv[0][0]*10000 + kv[0][1]*100 + kv[0][2] for kv in c]
names = [name(x) for x,y in c]
- fig, axarr = plt.subplots(2, sharex=True, figsize=(1200/80, 500/80))
+ fig, axarr = plt.subplots(2, sharex=True, figsize=(1200/120, 500/120))
palette = sns.color_palette('hls', n_colors=12)
clrs = [palette[d[0][1]-1] for d in c]
bp1 = sns.barplot(datesort, [y for x,y in c], palette = clrs, ax=axarr[0], linewidth=0)
diff --git a/Assignment 2/spelen/feature_importances.py b/Assignment 2/spelen/feature_importances.py
new file mode 100644
index 0000000..d90bb2f
--- /dev/null
+++ b/Assignment 2/spelen/feature_importances.py
@@ -0,0 +1,21 @@
+from rankpy.models import LambdaMART
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+with open("test.svmlight") as f:
+ head = [next(f) for x in range(4)]
+features = map(lambda x: x.split(':')[1], head[-1][2:-1].split(' '))
+columns = pd.read_csv("../test_set_VU_DM_2014.csv", header=0, nrows = 1).columns.values.tolist()
+lm = LambdaMART.load("LambdaMartModel0.5.model")
+feats = dict(zip(features, lm.feature_importances()))
+feats = sorted(feats.items(), key=lambda kv: -kv[1])
+
+fig, ax = plt.subplots(figsize=(1200/120, 500/120))
+bp = sns.barplot( map(lambda x: x[0], feats), map(lambda x: x[1], feats))
+
+for item in bp.get_xticklabels():
+ item.set_rotation(90)
+plt.subplots_adjust(bottom=0.5)
+
+plt.savefig("feature_importances", dpi=400)
diff --git a/Assignment 2/spelen/ranklib2submission.py b/Assignment 2/spelen/ranklib2submission.py
new file mode 100644
index 0000000..a217a94
--- /dev/null
+++ b/Assignment 2/spelen/ranklib2submission.py
@@ -0,0 +1,12 @@
+import time
+import pandas as pd
+
+if __name__ == "__main__":
+ ranklib_df = pd.read_csv("model_ranklib_4885.scores", sep="\t", header=None, names=['srch_id', 'local_prod_id', 'score'])
+ test_df = pd.read_csv("../test_set_VU_DM_2014.csv", header=0, usecols=['srch_id', 'prop_id'])
+ test_df['score'] = -ranklib_df['score']
+ sorted_df = test_df[['srch_id', 'prop_id', 'score']].sort_values(['srch_id', 'score'])
+ sorted_df.score = -sorted_df.score
+
+ submission = pd.DataFrame({ 'SearchId': sorted_df.srch_id, 'PropertyId': sorted_df.prop_id})[['SearchId', 'PropertyId']]
+ submission.to_csv('model_scoring_4965.csv', index=False)
diff --git a/Assignment 2/spelen/run_lambdamart.py b/Assignment 2/spelen/run_lambdamart.py
index d7459b3..baa3e5c 100644
--- a/Assignment 2/spelen/run_lambdamart.py
+++ b/Assignment 2/spelen/run_lambdamart.py
@@ -15,9 +15,9 @@ from rankpy.models import LambdaMART
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)
# Load the query datasets.
-training_queries = Queries.load_from_text('train.svmlight')
-validation_queries = Queries.load_from_text('vali.svmlight')
-test_queries = Queries.load_from_text('test.svmlight')
+training_queries = Queries.load_from_text('train_without_means.svmlight')
+validation_queries = Queries.load_from_text('vali_without_means.svmlight')
+test_queries = Queries.load_from_text('test_without_means.svmlight')
logging.info('================================================================================')
@@ -48,17 +48,17 @@ logging.info('Test queries: %s' % test_queries)
logging.info('================================================================================')
model = LambdaMART(metric='nDCG@38', max_leaf_nodes=7, shrinkage=0.1,
- estopping=4, n_jobs=-1, min_samples_leaf=50,
+ estopping=10, n_jobs=-1, min_samples_leaf=50,
random_state=42)
#TODO: do some crossval here?
-model.fit(training_queries, validation_queries=validation_queries)
+model.fit(train_queries, validation_queries=test_queries)
logging.info('================================================================================')
logging.info('%s on the test queries: %.8f'
- % (model.metric, model.evaluate(validation_queries, n_jobs=-1)))
+ % (model.metric, model.evaluate(test_queries, n_jobs=-1)))
-model.save('LambdaMART_L7_S0.1_E50_' + model.metric)
+model.save('LambdaMART_L7_S0.1_E50_without_' + model.metric)
predicted_rankings = model.predict_rankings(test_queries)
test_df = pd.read_csv("../test_set_VU_DM_2014.csv", header=0, nrows = test_queries.document_count())
@@ -66,4 +66,4 @@ test_df['pred_position'] = np.concatenate(predicted_rankings)
sorted_df = test_df[['srch_id', 'prop_id', 'pred_position']].sort_values(['srch_id', 'pred_position'])
submission = pd.DataFrame({ 'SearchId': sorted_df.srch_id, 'PropertyId': sorted_df.prop_id })[['SearchId', 'PropertyId']]
-submission.to_csv('model_%d_%f.csv' % (test_queries.document_count(), model.evaluate(validation_queries, n_jobs=-1)), index=False)
+submission.to_csv('model_%d2_%f.csv' % (test_queries.document_count(), model.evaluate(test_queries, n_jobs=-1)), index=False)
diff --git a/Assignment 2/to_svmlight.py b/Assignment 2/to_svmlight.py
index 4f115a1..d12be8e 100644
--- a/Assignment 2/to_svmlight.py
+++ b/Assignment 2/to_svmlight.py
@@ -64,7 +64,7 @@ def preprocess(df, train):
nrows = None
-# nrows = int(1e5)
+#nrows = int(1e5)
data_train = pd.read_csv("training_set_VU_DM_2014.csv", header=0, parse_dates=[1], nrows=nrows)
try:
data_test = pd.read_csv("testsetnew.csv", header=0, parse_dates=[1], nrows=nrows)
@@ -73,11 +73,23 @@ except IOError:
print("loaded csv's")
+# pre-fill missing prop_location_score2 scores with first quartile of country:
+# source: Bing Xu et al. (fourth place)
+all_data = pd.concat([data_train, data_test], copy=False)
+location_quartile = all_data.groupby("prop_country_id")["prop_location_score2"].quantile(q=0.25)
+
+for d in (data_train, data_test):
+ d["prop_location_score2_quartile"] = location_quartile[d.prop_id].values
+ d["prop_location_score2"].fillna(d["prop_location_score2_quartile"])
+ del d["prop_location_score2_quartile"]
+
+
# fill missing values with worst case scenario. Source: Jun Wang 3rd place
# ["prop_review_score", "prop_location_score2", "orig_destination_distance"]
data_train = data_train.fillna(value=-1)
data_test = data_test.fillna(value=-1)
+"""
# feature engineering using all numeric features
# avg/median/std numeric features per prop_id
numeric_features = ["prop_starrating", "prop_review_score", "prop_location_score1", "prop_location_score2"]
@@ -91,7 +103,8 @@ for label in numeric_features:
for d in (data_train, data_test):
d[label + "_mean"] = mean[d.prop_id].values
d[label + "_median"] = median[d.prop_id].values
- d[label + "_mean"] = std[d.prop_id].values
+ d[label + "_std"] = std[d.prop_id].values
+ """
train, Xtr, qtr, ytr, feature_labels = preprocess(data_train[data_train.srch_id % 10 != 0], train=True)
@@ -115,6 +128,6 @@ p = Pool()
# dump_svmlight_file(Xtr, ytr, 'spelen/train.svmlight', query_id=qtr, comment=comment)
# dump_svmlight_file(Xva, yva, 'spelen/vali.svmlight', query_id=qva, comment=comment)
# dump_svmlight_file(Xte, np.zeros(len(data_test)), 'spelen/test.svmlight', query_id=qte, comment=comment)
-p.map(dump, ((Xtr, ytr, 'spelen/train.svmlight', qtr, comment),
- (Xva, yva, 'spelen/vali.svmlight', qva, comment),
- (Xte, yte, 'spelen/test.svmlight', qte, comment)))
+p.map(dump, ((Xtr, ytr, 'spelen/train_without_means.svmlight', qtr, comment),
+ (Xva, yva, 'spelen/vali_without_means.svmlight', qva, comment),
+ (Xte, yte, 'spelen/test_without_means.svmlight', qte, comment)))