Merge pull request #2 from adiIspas/lightfm-version
Lightfm version
adiIspas committed May 11, 2019
2 parents f57fc95 + 734f4ec commit a2a92b5
Showing 28 changed files with 20,773 additions and 0 deletions.
57 changes: 57 additions & 0 deletions dataset/utils/collect_posters_in_clusters.py
@@ -0,0 +1,57 @@
import os
import pandas as pd
import csv
import shutil

dataset = '../../../king-rec-dataset/ml-latest-small/images/'
number_of_clusters = 7
model = 'vgg16'
clusters_dir = '../../../king-rec-dataset/ml-latest-small/results/clusters/sanity-check/' + model + '/' + str(number_of_clusters) + '/'


def collect_posters():
    data = pd.read_csv('sanity_check_movies_1_poster_clusters_' + model + '.csv')

    # create one directory per cluster
    for idx in range(1, number_of_clusters + 1):
        os.makedirs(clusters_dir + str(idx), exist_ok=True)

    # copy each poster into its associated cluster directory
    for index, row in data.iterrows():
        src = dataset + str(int(row['0'])) + '/posters/' + str(int(row['1'])) + '.jpg'
        dest = clusters_dir + str(int(row['cluster_' + str(number_of_clusters)]) + 1) + '/' + str(int(row['0'])) + '_' + str(int(row['1'])) + '.jpg'

        if os.path.isfile(src):
            shutil.copy(src, dest)

    print('Done')


collect_posters()

dataset2 = '../../../king-rec-dataset/ml-latest-small/'


def get_items_ids():
    item_ids = set()

    with open(dataset2 + 'movies.csv', 'r') as movies_file:
        reader = csv.reader(movies_file, delimiter=',')
        next(reader)  # skip header

        for row in reader:
            item_ids.add(int(row[0]))

    return item_ids


def count_movies():
    movies = get_items_ids()

    for idx, item in enumerate(movies, start=1):
        print(idx, item)


# count_movies()
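
A quick sanity check after the copy step is to count how many posters landed in each cluster directory. A minimal sketch, reusing the clusters_dir configured at the top of the script:

import os

clusters_dir = '../../../king-rec-dataset/ml-latest-small/results/clusters/sanity-check/vgg16/7/'
for cluster in sorted(os.listdir(clusters_dir), key=int):
    count = len(os.listdir(os.path.join(clusters_dir, cluster)))
    print('cluster', cluster, ':', count, 'posters')
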
88 changes: 88 additions & 0 deletions dataset/utils/create_clusters.py
@@ -0,0 +1,88 @@
import numpy as np
import matplotlib.pyplot as plt
import csv
import pandas as pd

from sklearn import metrics
from sklearn.cluster import KMeans


dataset = '../../../king-rec-dataset/ml-latest-small/'


def get_items_ids():
    item_ids = set()

    with open(dataset + 'movies.csv', 'r') as movies_file:
        reader = csv.reader(movies_file, delimiter=',')
        next(reader)  # skip header

        for row in reader:
            item_ids.add(int(row[0]))

    return item_ids


def explore_clusters():
    clusters = range(2, 22, 2)
    models_results = dict()
    colors = ['r', 'y', 'b', 'g', 'c']

    models = ['vgg16', 'vgg19', 'inception_v3', 'resnet50', 'NASNet']

    for model in models:
        print('Reading data ...')
        feature_list = np.loadtxt('./posters_features/1000-movies/' + model + '1000-movies_1-posters.csv', delimiter=',')
        print('Finished reading data.')

        # first two columns hold the movie id and poster id; the rest are features
        movie_poster_clusters = pd.DataFrame(feature_list[:, :2])

        feature_list = feature_list[:, 2:]
        feature_list_np = np.array(feature_list)
        for n_clusters in clusters:
            k_means = KMeans(n_clusters=n_clusters).fit(feature_list_np)

            name = model
            result = metrics.silhouette_score(feature_list_np, k_means.labels_)

            if name not in models_results:
                results = []
            else:
                results = models_results.pop(name)

            cluster_name = 'cluster_' + str(n_clusters)
            movie_poster_clusters[cluster_name] = pd.Series(k_means.labels_)

            results.append(result)
            models_results.update({name: results})
            print('silhouette score on', name, 'with', n_clusters, 'clusters:', result)

        movie_poster_clusters.to_csv('movies_1_poster_clusters_' + name + '.csv')

    n_groups = len(list(clusters))
    index = np.arange(n_groups)
    bar_width = 0.15
    current_index = 0

    for key, values in models_results.items():
        plt.bar(index + bar_width * current_index, values, bar_width,
                color=colors[current_index],
                label=key)
        current_index += 1

    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette score')
    plt.title('Silhouette score by model')
    plt.xticks(index + bar_width, list(clusters))
    plt.legend()
    plt.tight_layout()
    plt.savefig('silhouette-score.jpg')
    plt.show()


def main():
    explore_clusters()


if __name__ == "__main__":
    main()
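
For context, silhouette_score returns a value in [-1, 1]; higher means points sit closer to their own cluster than to the nearest neighboring one, so the bar chart above favors models whose features yield larger scores. A self-contained sketch of the scoring step, with random data standing in for the poster features:

import numpy as np
from sklearn import metrics
from sklearn.cluster import KMeans

features = np.random.rand(100, 8)  # stand-in for the real poster feature matrix
labels = KMeans(n_clusters=4).fit(features).labels_
print(metrics.silhouette_score(features, labels))  # in [-1, 1]; higher is better
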
80 changes: 80 additions & 0 deletions dataset/utils/create_clusters_minibatch.py
@@ -0,0 +1,80 @@
import csv
import numpy as np
import pandas as pd

from sklearn.cluster import MiniBatchKMeans


dataset = '../../../king-rec-dataset/ml-latest-small/'


def get_items_ids():
    item_ids = set()

    with open(dataset + 'movies.csv', 'r') as movies_file:
        reader = csv.reader(movies_file, delimiter=',')
        next(reader)  # skip header

        for row in reader:
            item_ids.add(int(row[0]))

    return item_ids


def explore_clusters():
    batch_size = 40

    # models = ['vgg16', 'vgg19', 'inception_v3', 'resnet50', 'NASNet']
    models = ['resnet50']

    for model in models:
        # csv_path = './' + model + '-1-posters.csv'
        csv_path = './posters_features/sanity-check/' + model + '-sanity-check.csv'

        movie_poster_clusters = pd.DataFrame()
        for n_clusters in [7]:
            final_clusters = pd.Series()
            print('Process cluster', n_clusters)

            k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=batch_size, compute_labels=True)

            # first pass: fit the centroids incrementally, chunk by chunk
            reader_chunks = pd.read_csv(csv_path, delimiter=',', header=None, chunksize=batch_size)
            for chunk in reader_chunks:
                print('Processing chunk ...')

                feature_list = pd.DataFrame(data=chunk)

                movie_poster_clusters = movie_poster_clusters.append(feature_list.iloc[:, :2])

                feature_list = feature_list.iloc[:, 2:]
                feature_list_np = np.array(feature_list)

                k_means.partial_fit(feature_list_np)

            # second pass: assign every chunk to its final cluster
            reader_chunks = pd.read_csv(csv_path, delimiter=',', header=None, chunksize=batch_size)
            for chunk in reader_chunks:
                print('Predicting chunk ...')

                feature_list = pd.DataFrame(data=chunk)

                feature_list = feature_list.iloc[:, 2:]
                feature_list_np = np.array(feature_list)

                labels = k_means.predict(feature_list_np)

                final_clusters = final_clusters.append(pd.Series(labels))

            name = model

            cluster_name = 'cluster_' + str(n_clusters)
            movie_poster_clusters[cluster_name] = pd.Series(final_clusters.values, index=movie_poster_clusters.index)

        movie_poster_clusters.to_csv('test-chunk-movies_1_poster_clusters_' + name + '.csv')


def main():
    explore_clusters()


if __name__ == "__main__":
    main()
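
Note the two-pass structure above: the CSV is streamed twice, once to fit the centroids incrementally with partial_fit and once to assign final labels with predict, so the full feature matrix never has to fit in memory at once. The pattern in isolation, with synthetic chunks standing in for the CSV reader:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

chunks = [np.random.rand(40, 8) for _ in range(5)]  # stand-ins for the CSV chunks
k_means = MiniBatchKMeans(n_clusters=7, batch_size=40)
for chunk in chunks:  # pass 1: fit centroids incrementally
    k_means.partial_fit(chunk)
labels = [k_means.predict(chunk) for chunk in chunks]  # pass 2: assign labels
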
85 changes: 85 additions & 0 deletions dataset/utils/downloader.py
@@ -0,0 +1,85 @@
import os
import csv
import sys
import requests
import urllib.request

api_key = sys.argv[1]

dataset = '../../king-rec-dataset/ml-latest-small/'
tmdb_api = 'https://api.themoviedb.org/3/movie/$MOVIE_ID/images?include_image_language=en,null&api_key=$API_KEY'
tmdb_images_url = 'https://image.tmdb.org/t/p/original/'


def get_tmdb_posters(tmdb_api_key, max_movie_index=10):
    tmdb_movies_id = get_tmdb_ids()
    download_images(tmdb_api_key, tmdb_movies_id, max_movie_index)

    return tmdb_movies_id


def download_images(tmdb_api_key, tmdb_movies_id, max_movie_index=10):
    images = dataset + 'images/'

    movie_index = 1
    total_movies = len(tmdb_movies_id)

    for key, value in tmdb_movies_id.items():
        posters = images + str(key) + '/posters/'
        backdrops = images + str(key) + '/backdrops/'

        if not os.path.exists(posters):
            os.makedirs(posters)

        if not os.path.exists(backdrops):
            os.makedirs(backdrops)

        if len(os.listdir(posters)) == 0 or len(os.listdir(backdrops)) == 0:
            current_url = tmdb_api.replace('$MOVIE_ID', str(value)).replace('$API_KEY', tmdb_api_key)
            response = requests.get(current_url)

            if response.status_code == 200:
                json = response.json()

                if len(os.listdir(posters)) == 0:
                    image_idx = 1
                    for poster in json['posters']:
                        if poster['iso_639_1'] == 'en':
                            print(movie_index, '/', total_movies, '- Process movie', value, 'and poster', image_idx)
                            poster_url = poster['file_path']
                            urllib.request.urlretrieve(tmdb_images_url + poster_url, posters + str(image_idx) + '.jpg')
                            image_idx += 1

                if len(os.listdir(backdrops)) == 0:
                    image_idx = 1
                    for backdrop in json['backdrops']:
                        if backdrop['iso_639_1'] == 'xx' or backdrop['iso_639_1'] is None:
                            print(movie_index, '/', total_movies, '- Process movie', value, 'and backdrop', image_idx)
                            backdrop_url = backdrop['file_path']
                            urllib.request.urlretrieve(tmdb_images_url + backdrop_url,
                                                       backdrops + str(image_idx) + '.jpg')
                            image_idx += 1

            else:
                print('Status code:', response.status_code, 'on movie', key, '-', value)

        if movie_index == max_movie_index:
            break

        movie_index += 1


def get_tmdb_ids(tmdb_index=2):
    links = dataset + 'links.csv'
    with open(links, 'r') as links_file:
        reader = csv.reader(links_file, delimiter=',')
        next(reader)  # skip header

        tmdb_movies_id = dict()
        for row in reader:
            tmdb_movies_id.update({row[0]: row[tmdb_index]})

    return tmdb_movies_id


get_tmdb_posters(api_key, max_movie_index=20)
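
The script reads the TMDb API key from its first command-line argument, so a typical invocation would look something like this (the key value is a placeholder):

python downloader.py YOUR_TMDB_API_KEY
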
81 changes: 81 additions & 0 deletions dataset/utils/extract_features.py
@@ -0,0 +1,81 @@
import os
import numpy as np
import csv
import pandas as pd

from keras.layers import Input
from keras.preprocessing import image
from keras.applications.vgg16 import VGG16
from keras.applications.vgg19 import VGG19
from keras.applications.inception_v3 import InceptionV3
from keras.applications.resnet50 import ResNet50
from keras.applications.nasnet import NASNetLarge
from keras.applications.imagenet_utils import preprocess_input

dataset = '../../../king-rec-dataset/ml-latest-small/'
base_path = 'images/'
# base_path = 'clusters_sanity_check/'
max_posters_per_movie = 1


def get_int(filename):
    return int(filename.split('.')[0])


def get_items_ids():
    item_ids = set()

    with open(dataset + 'movies.csv', 'r') as movies_file:
        reader = csv.reader(movies_file, delimiter=',')
        next(reader)  # skip header

        for row in reader:
            item_ids.add(int(row[0]))

    return item_ids


def extract_images_features():
    movies = list(get_items_ids())
    # movies = [1, 3, 4, 5, 7, 19, 22, 23]
    subdir = [dataset + base_path + str(movie) + '/posters/' for movie in movies]
    models = [
        VGG16(weights='imagenet', include_top=False),
        VGG19(weights='imagenet', include_top=False),
        InceptionV3(weights='imagenet', include_top=False),
        ResNet50(weights='imagenet', include_top=False),
        NASNetLarge(weights='imagenet', include_top=False, input_tensor=Input(shape=(224, 224, 3)))
    ]
    total_movies = len(subdir)
    for current_movie, dirname in enumerate(subdir):
        movie_idx = int([s for s in dirname.split('/') if s.isdigit()][0])
        filenames = sorted(os.listdir(dirname), key=get_int)[0:max_posters_per_movie]

        for file_name in filenames:
            poster_idx = int(file_name.split('.')[0])

            img = image.load_img(dirname + '/' + file_name, target_size=(224, 224))
            img_data = image.img_to_array(img)
            img_data = np.expand_dims(img_data, axis=0)
            img_data = preprocess_input(img_data)

            for model in models:
                feature = model.predict(img_data)
                feature = np.array(feature).flatten()

                # prepend the movie id and poster id so each CSV row is self-describing
                data_to_save = np.append([movie_idx, poster_idx], feature)
                data = pd.DataFrame([data_to_save])
                data.to_csv(model.name + '-' + str(max_posters_per_movie) + '-posters' + '.csv',
                            mode='a', sep=',', index=False, header=False)

                print(str(current_movie + 1) + '/' + str(total_movies) + ':', 'movie id:', movie_idx,
                      ' poster id:', poster_idx, ' model name:', model.name, ' total features:', len(feature))


def main():
    extract_images_features()


if __name__ == "__main__":
    main()
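
One caveat worth flagging: the generic keras.applications.imagenet_utils.preprocess_input used above applies VGG-style ('caffe') preprocessing to every model, while keras.applications also ships model-specific variants (InceptionV3, for instance, expects inputs scaled to [-1, 1]). A sketch of selecting the matching preprocessor per model; the preprocessors table is illustrative and not part of the original script:

import numpy as np
from keras.applications import vgg16, vgg19, inception_v3, resnet50, nasnet

# illustrative lookup table mapping the script's model names to their own preprocess_input
preprocessors = {
    'vgg16': vgg16.preprocess_input,
    'vgg19': vgg19.preprocess_input,
    'inception_v3': inception_v3.preprocess_input,
    'resnet50': resnet50.preprocess_input,
    'NASNet': nasnet.preprocess_input,
}

img_data = np.random.rand(1, 224, 224, 3) * 255  # stand-in for a loaded poster
img_data = preprocessors['inception_v3'](img_data)  # now scaled to [-1, 1]
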