6 changes: 4 additions & 2 deletions .gitignore
@@ -1,4 +1,6 @@
pages/search_sol.py
*.pyc
*.csv
.data/
.data/
/user_data
/.taipy
/.vscode
1 change: 1 addition & 0 deletions .vscode/launch.json
@@ -11,6 +11,7 @@
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": true,
"env":{"GEVENT_SUPPORT":"TRUE"}
}
]
}
54 changes: 26 additions & 28 deletions algos/algos.py
@@ -3,9 +3,24 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import time

def dumb(a,b,c):
...
from algos.recommender_algos import process_title, give_recommendations

def recommend_films_to_user(a,b,c):
start = time.time()
recommended_content = []
if len(a) > 0:
all_film_bag = ' '.join([process_title(film, movies) for film in a])

recs = give_recommendations(all_film_bag, 30, movies_for_recommendation)
for rec in recs:
if (b.count(rec) == 0) and (c.count(rec) == 0):
recommended_content.append(rec)
if len(recommended_content) >= 10:
break
print("time for recommend_films_to_user", time.time() - start)
return recommended_content

def clean_title(title):
title = re.sub("[^a-zA-Z0-9 ]", "", title)
@@ -15,26 +30,12 @@ def search(title):
title = clean_title(title)
query_vec = vectorizer.transform([title])
similarity = cosine_similarity(query_vec, tfidf).flatten()
indices = np.argpartition(similarity, -5)[-5:]
indices = np.argpartition(similarity, -5)[-4:]
results = movies.iloc[indices].iloc[::-1]
print(results)
return results

def get_rating(movie_id):
try:
if int(movie_id) in ratings["movieId"].unique():
return {"Ratings" : list(ratings[ratings["movieId"] == int(movie_id)]["rating"])}
else:
return {"Ratings" : [0,1,2]}
except Exception as e:
print(movie_id, e)
return {"Ratings" : [0,1,2]}


def mean_rating(selected_film):
return float(np.mean(get_rating(selected_film)['Ratings']))

def find_similar_movies(movie_id):
def find_similar_movies(movie_id, num_rec=10):
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
@@ -47,19 +48,16 @@ def find_similar_movies(movie_id):

rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
rec_percentages = rec_percentages.sort_values("score", ascending=False)
return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]
return rec_percentages.head(num_rec).merge(movies.reset_index(drop = True), left_index=True, right_on="movieId")[["score", "title", "genres", "movieId"]]


vectorizer = TfidfVectorizer(ngram_range=(1,2))
#movie_id = 89745

ratings = pd.read_csv('data/ratings.csv')
ratings = pd.read_parquet('data/ratings')

movies = pd.read_csv("data/movies.csv")
movies["clean_title"] = movies["title"].apply(clean_title)
tfidf = vectorizer.fit_transform(movies["clean_title"])
movies = pd.read_parquet('data/augmented_movies.parquet')

#print(search('Avengers'))
#start=time.time()
#print(find_similar_movies(movie_id))
#print(time.time()-start)
movies["clean_title"] = movies["title"].apply(clean_title)
movies.index = movies['movieId']
movies_for_recommendation = movies[movies['Nb ratings']>=5_000]
tfidf = vectorizer.fit_transform(movies["clean_title"])
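A minimal usage sketch of the reworked module (illustration only, not part of the diff). It assumes the parquet files above are in place, that titles follow the exact MovieLens style, and that recommend_films_to_user takes a list of liked titles plus two lists of movieIds to exclude, which is how the arguments are used in the function body:

from algos.algos import search, find_similar_movies, recommend_films_to_user

# fuzzy title lookup against the TF-IDF index built at import time
matches = search("Toy Story")
top_id = int(matches["movieId"].iloc[0])

# collaborative-filtering neighbours, now with a configurable count
similar = find_similar_movies(top_id, num_rec=5)

# content-based picks, skipping movieIds listed in the two exclusion lists
recs = recommend_films_to_user(["Toy Story (1995)"], [], [])
print(similar, recs)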
65 changes: 65 additions & 0 deletions algos/recommender_algos.py
@@ -0,0 +1,65 @@
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from rake_nltk import Rake

def extract_keywords(text):
'''this function extracts keywords from movie description using nltk Rake'''
r = Rake()

r.extract_keywords_from_text(text)

# getting the dictionary with key words as keys and their scores as values
key_words_dict_scores = r.get_word_degrees()

key_words = list(key_words_dict_scores.keys())

return key_words

def process_title(title, movie_df):
'''this function, given a movie title, retrieves movie info from the movie database, and outputs it as a string to be processed by the vectorizer '''
movie_bag = movie_df[movie_df.title == title]['Bagofwords'].values[0]
return movie_bag

def give_recommendations(movie_bagofwords, num_rec, movie_df):
'''given a movie bag_of_words and the desired number of recommendations, this function returns a list of recommended movieIds'''


vectorizer = CountVectorizer()
count_matrix = vectorizer.fit_transform(movie_df['Bagofwords'])
count_array = count_matrix.toarray()
count_df = pd.DataFrame(data=count_array,columns = vectorizer.get_feature_names_out())

v = vectorizer.transform([movie_bagofwords])
cos_sim = cosine_similarity(count_df,v)
cos_sim = cos_sim.flatten()
sorted_cos_sim = np.argsort(cos_sim)
reverse_sorted = sorted_cos_sim[::-1]
rec_indices = reverse_sorted[:int(num_rec)]
idx = list(count_df.iloc[rec_indices,0].index)
recommendations = movie_df.iloc[idx,:].reset_index(drop=True)
recommendations = [recommendations['movieId'][i] for i in range(len(recommendations))]
return recommendations


if __name__ == '__main__':
import sys

if len(sys.argv) != 5:
print("Usage: function requires a movie title, number of desired recommendations, movie database file name, and the top 1000 IMDB movie file")
sys.exit()


MOVIE_TITLE = sys.argv[1]
NUM_REC = sys.argv[2]
MOVIE_FILE = sys.argv[3]
TOP_1000_FILE = sys.argv[4]


movie_df = pd.read_csv(MOVIE_FILE)
movie_recommendations = give_recommendations(process_title(MOVIE_TITLE, movie_df), int(NUM_REC), movie_df)
print(movie_recommendations)
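For reference, a short sketch of the content-based flow this module implements (illustrative only; it assumes data/augmented_movies.parquet carries the 'title', 'movieId' and 'Bagofwords' columns the functions above expect):

import pandas as pd
from algos.recommender_algos import process_title, give_recommendations

movie_df = pd.read_parquet("data/augmented_movies.parquet")
bag = process_title("Toy Story (1995)", movie_df)   # look up the film's bag-of-words
recs = give_recommendations(bag, 10, movie_df)       # ten most similar movieIds
print(recs)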
21 changes: 21 additions & 0 deletions algos/recommender_main.py
@@ -0,0 +1,21 @@
import pandas as pd

import taipy as tp
from taipy import Config

### recommender_data
MOVIES_FILE = 'data/augmented_movies.parquet'

movie_df = pd.read_parquet(MOVIES_FILE)

Config.load("config/config_recommender.toml")
scenario_cfg = Config.scenarios['scenario']

if __name__ == "__main__":
# run Taipy Core
tp.Core().run()

# create my scenario
scenario = tp.create_scenario(scenario_cfg)
tp.submit(scenario)
print("We recommend:", scenario.recommendations.read())
63 changes: 63 additions & 0 deletions api.py
@@ -0,0 +1,63 @@
import requests
import pandas as pd
import time
import concurrent.futures

api_key = "15d2ea6d0dc1d476efbca3eba2b9bbfb"

links_tmdbId = pd.read_csv('data/links.csv')
links_tmdbId.dropna(inplace=True)
links_tmdbId.index = links_tmdbId.tmdbId.astype(int)
links_movieId = pd.read_csv('data/links.csv')
links_movieId.index = links_movieId.movieId.astype(int)


def fetch_api_in_parallel(tmdb_ids):
with concurrent.futures.ThreadPoolExecutor() as executor:
results = executor.map(fetch_api_film, tmdb_ids)
return list(results)

def fetch_api_film(tmdb_id):
start = time.time()
url = f"https://api.themoviedb.org/3/movie/{tmdb_id}?language=en-US&api_key={api_key}"
response = requests.get(url)
if response.status_code == 200:
results = response.json()
if results:
print(f"Data fetched in {time.time() - start:.2f} seconds.")
return results
else:
print(f"Error: Unable to fetch data. Status code: {response.status_code}")
return None

def search_api_film(query):
url = f"https://api.themoviedb.org/3/search/movie?api_key={api_key}&query={query}"
response = requests.get(url)
if response.status_code == 200:
results = response.json()
if results and 'results' in results:
return results['results']
else:
print(f"Error: Unable to fetch data. Status code: {response.status_code}")
return None


def transform_tmdb_id_to_movie_id(tmdb_id):
try:
movie_id = links_tmdbId.loc[int(tmdb_id), 'movieId']
except:
movie_id = -1
return str(int(movie_id))

def transform_movie_id_to_tmdb_id(movie_id):
tmdb_id = links_movieId.loc[int(movie_id), 'tmdbId']
return int(tmdb_id)

if __name__ == '__main__':
tmdb_id = 8966
movie_id = transform_tmdb_id_to_movie_id(tmdb_id)
tmdb_id_retransformed = transform_movie_id_to_tmdb_id(movie_id)
print(fetch_api_film(tmdb_id_retransformed))
print(tmdb_id)
print(movie_id)
print(tmdb_id_retransformed)
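A quick illustration of the parallel fetch helper (not part of the diff). The TMDB ids are arbitrary examples, and each successful result is a TMDB movie-details payload, which carries 'id' and 'title' fields:

films = fetch_api_in_parallel([8966, 603, 550])   # example TMDB ids
for film in films:
    if film is not None:
        print(film["title"], "-> movieId", transform_tmdb_id_to_movie_id(film["id"]))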
Binary file removed assets/.DS_Store