基于内容的推荐
目录
基于内容的推荐 Content Based
优点:
- 思路简单,可解释性强
- 用户推荐具有独立性,不受其他用户的影响
缺点:
- 广度较窄,过多地关注用户喜好的物品
- 要积累一定量的用户行为
步骤:
- 对物品进行刻画 item profile
  - topic finding
  - genre classify
- 对用户进行刻画 user profile
  - genre/topic
  - time decay
- 在线推荐
  - find top k genre/topic
  - get the best n items from fixed genre/topic
import os
def get_ave_score(input_file):
    """Compute the average rating of every item in a ratings file.

    The file is read as comma-separated text with one header line. Each data
    row must have at least four fields; the second field is used as the item
    id and the third as the rating. Shorter rows are skipped.

    Args:
        input_file: Path to the ratings file.

    Returns:
        A dict mapping item id (str) to its mean rating rounded to three
        decimals. Empty when the file does not exist or has no data rows.
    """
    if not os.path.exists(input_file):
        return {}
    # item id -> [sum of ratings, number of ratings]
    totals = {}
    with open(input_file, 'r') as f:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(f, None)
        for line in f:
            fields = line.strip().split(",")
            if len(fields) < 4:
                continue
            itemid, rating = fields[1], float(fields[2])
            entry = totals.setdefault(itemid, [0, 0])
            entry[0] += rating
            entry[1] += 1
    return {
        itemid: round(rating_sum / rating_count, 3)
        for itemid, (rating_sum, rating_count) in totals.items()
    }
def get_item_cate(ave_score, input_file):
    """Build per-item category weights and a per-category item ranking.

    The file is read as comma-separated UTF-8 text with one header line.
    Rows with fewer than three fields are skipped. Only the first field
    (item id) and the last field ("|"-separated categories) are used, so
    extra commas in the middle fields do not affect parsing.

    Args:
        ave_score: Dict mapping item id to average rating. Items missing
            from it are ranked with a score of 0.
        input_file: Path to the item description file.

    Returns:
        A tuple ``(item_cate, cate_item_sort)``:
        ``item_cate`` maps item id -> {category: weight}, where each of an
        item's categories gets the equal share ``round(1 / n, 3)``.
        ``cate_item_sort`` maps category -> list of at most 100 item ids,
        ordered by descending average rating.
        Both are empty when the file does not exist or has no data rows.
    """
    if not os.path.exists(input_file):
        return {}, {}
    top_k = 100

    item_cate = {}
    with open(input_file, "r", encoding="utf-8") as f:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(f, None)
        for line in f:
            fields = line.strip().split(",")
            if len(fields) < 3:
                continue
            itemid = fields[0]
            cate_list = fields[-1].strip().split("|")
            # An item may belong to several categories; split its weight
            # evenly between them.
            ratio = round(1 / len(cate_list), 3)
            weights = item_cate.setdefault(itemid, {})
            for cate in cate_list:
                weights[cate] = ratio

    # category -> {item id: average rating}
    cate_scores = {}
    for itemid, weights in item_cate.items():
        score = ave_score.get(itemid, 0)
        for cate in weights:
            cate_scores.setdefault(cate, {})[itemid] = score

    # Inverted index: best-rated items first, truncated to top_k per category.
    cate_item_sort = {}
    for cate, scores in cate_scores.items():
        ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        cate_item_sort[cate] = [itemid for itemid, _ in ranked[:top_k]]
    return item_cate, cate_item_sort
def get_up(item_cate, input_file):
    """Build a user profile: each user's top categories with normalized weights.

    The file is read as comma-separated text with one header line. Each data
    row must have at least four fields, used in order as user id, item id,
    rating and timestamp. A row contributes only when its rating is strictly
    greater than 4.0 and its item id is present in ``item_cate``. Its
    contribution to each of the item's categories is
    ``rating * get_time_score(timestamp) * category weight``.

    Args:
        item_cate: Dict mapping item id -> {category: weight}.
        input_file: Path to the ratings file.

    Returns:
        A dict mapping user id -> list of ``(category, weight)`` tuples for
        the user's two highest-scoring categories, where the weights are the
        scores divided by their sum and rounded to three decimals. Empty when
        the file does not exist or no row qualifies.
    """
    if not os.path.exists(input_file):
        return {}
    score_thr = 4.0
    top_k = 2

    # user id -> {category: accumulated score}
    record = {}
    with open(input_file, "r") as f:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(f, None)
        for line in f:
            fields = line.strip().split(",")
            if len(fields) < 4:
                continue
            userid, itemid = fields[0], fields[1]
            # The rating is the third field; the fourth is the timestamp.
            rating = float(fields[2])
            timestamp = int(fields[3])
            if rating <= score_thr:
                continue
            if itemid not in item_cate:
                continue
            time_score = get_time_score(timestamp)
            user_scores = record.setdefault(userid, {})
            for cate, weight in item_cate[itemid].items():
                user_scores[cate] = user_scores.get(cate, 0) + rating * time_score * weight

    up = {}
    for userid, user_scores in record.items():
        top = sorted(user_scores.items(), key=lambda pair: pair[1], reverse=True)[:top_k]
        total_score = sum(score for _, score in top)
        # Normalize so the kept categories' weights sum to (about) 1.
        up[userid] = [(cate, round(score / total_score, 3)) for cate, score in top]
    return up
def get_time_score(timestamp):
    """Return a weight for a rating based on how far it lies from a fixed reference time.

    The gap between the hard-coded reference timestamp and ``timestamp`` is
    expressed as a fraction of the reference timestamp, and the weight is
    ``1 / (1 + gap)`` rounded to three decimals. A timestamp equal to the
    reference therefore yields 1.0, and smaller timestamps yield smaller
    weights.

    Args:
        timestamp: Numeric timestamp of the rating.

    Returns:
        The weight as a float rounded to three decimals.
    """
    reference_ts = 1494273047
    relative_gap = (reference_ts - timestamp) / reference_ts
    weight = 1 / (1 + relative_gap)
    return round(weight, 3)
def recom(cate_item_sort, up, userid, topK=10):
    """Recommend items for one user from the categories in their profile.

    For every ``(category, ratio)`` entry in the user's profile, the first
    ``int(topK * ratio) + 1`` items of that category's ranked list are taken.
    Categories missing from ``cate_item_sort`` are skipped. Lists from
    different categories are concatenated as-is, so an item may appear more
    than once.

    Args:
        cate_item_sort: Dict mapping category -> ranked list of item ids.
        up: Dict mapping user id -> list of ``(category, ratio)`` entries.
        userid: The user to recommend for.
        topK: Base number of items shared out between the categories.

    Returns:
        ``{userid: [item ids]}``, or an empty dict when the user has no
        profile.
    """
    if userid not in up:
        return {}
    picked = []
    for entry in up[userid]:
        cate, ratio = entry[0], entry[1]
        quota = int(topK * ratio) + 1
        if cate in cate_item_sort:
            picked.extend(cate_item_sort[cate][:quota])
    return {userid: picked}
if __name__ == "__main__":
    # Demo run: build the item and user profiles from local CSV files, then
    # print the profile and the recommendations for one user.
    ratings_path = "D:/document/Python/ml-latest-small/ratings.csv"
    movies_path = "D:/document/Python/ml-latest-small/movies.csv"
    demo_user = "1"

    # Intermediate results (ave_score, item_cate, cate_item_sort) can be
    # printed after the calls below for spot checks.
    ave_score = get_ave_score(ratings_path)
    item_cate, cate_item_sort = get_item_cate(ave_score, movies_path)
    up = get_up(item_cate, ratings_path)

    print(len(up))
    print(up[demo_user])  # profile of user "1"
    print(recom(cate_item_sort, up, demo_user))  # recommendations for user "1"