LFM
目录
隐语义模型LFM(latent factor model)
参考文章https://blog.csdn.net/sinat_22594309/article/details/86576757、https://blog.csdn.net/harryhuang1990/article/details/9924377
简单来说就是对物品自动进行分类,通过算法来计算用户对每一类的喜好权重(其中类别数即为预设的LFM维度)
通过LFM算法可以得到用户的topLike(通过计算user*item得出用户最喜爱的几个物品)、item的topSimilar(通过计算item向量之间的距离得到最相似的几个物品)、item的topic(通过聚类可得到物品类别)
用户u对物品i的喜好程度可用公式表示为:$p^{LFM}(u,i)=p_u^Tq_i=\sum_{f=1}^Fp_{uf}q_{if}$
损失函数(加入L2正则化项):$\sum_{(u,i)\in D}(p(u,i)-p^{LFM}(u,i))^2 + \lambda ||p_u||^2 + \lambda ||q_i||^2$
步骤:
- 选取k个维度
- p和q随机初始化
import csv
import operator
import os

import numpy as np
# Extract movie info {id: [title, genre]}
def get_item_info(input_file):
    """Parse the movies CSV into a dict mapping movie id -> [title, genre].

    Uses csv.reader so titles containing commas (e.g.
    'American President, The (1995)') are un-quoted by standard CSV rules
    instead of the fragile join-and-strip-quotes approach.

    :param input_file: path to movies.csv; the header row is skipped
    :return: {itemid: [title, genre]}; empty dict if the file is missing
    """
    if not os.path.exists(input_file):
        return {}
    item_info = {}
    with open(input_file, encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for row in reader:
            if len(row) < 3:
                continue
            itemid = row[0]
            # Defensive: if a row still has extra fields after CSV parsing,
            # re-join the middle fields as the title, last field is the genre.
            title = ','.join(row[1:-1]) if len(row) > 3 else row[1]
            genre = row[-1]
            item_info[itemid] = [title, genre]
    return item_info
# Average rating per movie {id: score}
def get_ave_score(input_file):
    """Compute each movie's average rating, rounded to 3 decimals.

    :param input_file: path to ratings.csv; the header row is skipped
    :return: {itemid: average_rating}; empty dict if the file is missing
    """
    if not os.path.exists(input_file):
        return {}
    record_dict = {}  # itemid -> [num_ratings, rating_sum]
    with open(input_file, encoding='utf-8') as f:
        next(f)  # skip the header row
        for line in f:
            item = line.strip().split(",")
            if len(item) < 4:
                continue
            # item[:4] tolerates rows with trailing extra fields (the
            # original hard 4-tuple unpack raised ValueError on them).
            userid, itemid, rating, timestamp = item[:4]
            if itemid not in record_dict:
                record_dict[itemid] = [0, 0]
            record_dict[itemid][0] += 1
            record_dict[itemid][1] += float(rating)
    return {itemid: round(total / count, 3)
            for itemid, (count, total) in record_dict.items()}
# Build (userid, itemid, label) training samples for LFM
def get_train_data(input_file, score_thr=4.0):
    """Build balanced positive/negative training samples per user.

    Ratings >= score_thr are positives. Negatives are the user's low-rated
    items, ranked by the item's average score (disliking a popular item is
    a strong negative signal) and truncated so each user contributes equal
    numbers of positives and negatives.

    :param input_file: path to ratings.csv; the header row is skipped
    :param score_thr: rating threshold separating positives from negatives
        (default 4.0 — backward compatible with the original behaviour)
    :return: list of (userid, itemid, label) tuples with label in {0, 1}
    """
    if not os.path.exists(input_file):
        return []
    neg_dict = {}
    pos_dict = {}
    train_data = []
    # Item average scores used to rank negative candidates.
    score_dict = get_ave_score(input_file)
    with open(input_file, encoding='utf-8') as f:
        next(f)  # skip the header row
        for line in f:
            item = line.strip().split(",")
            if len(item) < 4:
                continue
            userid, itemid, rating, timestamp = item[:4]
            pos_dict.setdefault(userid, [])
            neg_dict.setdefault(userid, [])
            if float(rating) >= score_thr:
                pos_dict[userid].append(itemid)
            else:
                # Unknown average defaults to 0 (treated as least popular).
                neg_dict[userid].append((itemid, score_dict.get(itemid, 0)))
    # Balanced negative sampling: equal positives and negatives per user.
    for userid in pos_dict:
        data_num = min(len(pos_dict[userid]), len(neg_dict.get(userid, [])))
        if data_num == 0:
            continue
        # Slice before building the tuples (the original built the full
        # positive list and then sliced it).
        train_data += [(userid, itemid, 1)
                       for itemid in pos_dict[userid][:data_num]]
        # Highest-average-score negatives first: disliking a popular item
        # is the most informative "no interest" signal.
        sorted_neg_list = sorted(neg_dict[userid],
                                 key=lambda x: x[1], reverse=True)[:data_num]
        train_data += [(userid, itemid, 0) for itemid, _ in sorted_neg_list]
    return train_data
# LFM training: F latent dims, regularisation lamda, learning rate alpha, step iterations
# Returns the user vectors and item vectors
def lfm_train(train_data, F, lamda, alpha, step):
    """Train a latent-factor model with SGD.

    Fix over the original: each SGD step now updates the user and item
    vectors simultaneously from their pre-update values. The original
    updated user_vec in place first and then computed the item gradient
    from the already-updated user vector, biasing the item update.

    :param train_data: iterable of (userid, itemid, label) samples
    :param F: number of latent dimensions
    :param lamda: L2 regularisation strength
    :param alpha: initial learning rate, decayed by 0.9 each iteration
    :param step: number of passes over the training data
    :return: (user_vec, item_vec) dicts mapping id -> np.ndarray of length F
    """
    user_vec = {}
    item_vec = {}
    for step_index in range(step):
        print("迭代"+str(step_index+1)+"次,共"+str(step)+"次")
        for userid, itemid, label in train_data:
            if userid not in user_vec:
                user_vec[userid] = init_model(F)  # lazy init on first sight
            if itemid not in item_vec:
                item_vec[itemid] = init_model(F)
            u = user_vec[userid]
            q = item_vec[itemid]
            delta = label - model_predict(u, q)
            # Simultaneous update: both gradients use the old vectors.
            user_vec[userid] = u + alpha * (delta * q - lamda * u)
            item_vec[itemid] = q + alpha * (delta * u - lamda * q)
        alpha = alpha * 0.9  # learning-rate decay per iteration
    return user_vec, item_vec
# Initialise a latent vector
def init_model(vector_len):
    """Return a random vector of length vector_len with N(0, 1) entries."""
    return np.random.randn(vector_len)
# Model prediction
def model_predict(user_vec, item_vec):
    """Predict preference as the cosine similarity of the two vectors."""
    # Dividing by the norms normalises the score into [-1, 1].
    return np.dot(user_vec, item_vec) / (np.linalg.norm(user_vec) * np.linalg.norm(item_vec))
# Recommend items for userid; returns [(itemid, score), ...]
def recommend(userid, res_num=10):
    """Train an LFM model and return the top-res_num items for a user.

    NOTE(review): training runs on every call with a hard-coded data path
    and hyper-parameters, matching the original behaviour.

    :param userid: user id as a string (ids are read as strings from the CSV)
    :param res_num: number of recommendations to return (default 10; the
        original hard-coded this value)
    :return: list of (itemid, score) sorted by score descending; empty list
        if the user never appeared in the training data
    """
    record = {}
    train_data = get_train_data("D:/document/Python/ml-latest-small/ratings.csv")
    user_vec, item_vec = lfm_train(train_data, 50, 0.01, 0.1, 50)
    if userid not in user_vec:
        # Fix: the original raised KeyError for an unseen userid.
        return []
    u = user_vec[userid]
    u_norm = np.linalg.norm(u)  # hoisted out of the loop (loop-invariant)
    for itemid, q in item_vec.items():
        # Cosine similarity; dividing by the norms normalises the score.
        record[itemid] = np.dot(u, q) / (u_norm * np.linalg.norm(q))
    top = sorted(record.items(), key=operator.itemgetter(1), reverse=True)[:res_num]
    return [(itemid, round(score, 3)) for itemid, score in top]
# Analyse the recommendation result
def analysis_recom_result(train_data, userid, recom_list):
    """Print the movies a user liked (label == 1) next to the recommendations.

    :param train_data: list of (userid, itemid, label) training samples
    :param userid: user to analyse
    :param recom_list: list of (itemid, score) tuples from recommend()
    """
    item_info = get_item_info("D:/document/Python/ml-latest-small/movies.csv")
    for tmp_userid, itemid, label in train_data:
        # The user's positive training samples (movies they liked).
        if tmp_userid == userid and label == 1:
            # Fix: .get avoids a KeyError for ids missing from movies.csv.
            print(item_info.get(itemid, [itemid, "unknown"]))
    # Fix: corrected the typo "rersult" in the original message.
    print("recom result:")
    for itemid, _score in recom_list:
        print(item_info.get(itemid, [itemid, "unknown"]))
if __name__ == '__main__':
    # Build recommendations for user '1' and show the raw (itemid, score) list.
    recom_list = recommend('1')
    print(recom_list)
    # Compare the recommendations with the movies the user already liked.
    analysis_recom_result(get_train_data("D:/document/Python/ml-latest-small/ratings.csv"), '1', recom_list)
    # --- ad-hoc smoke tests kept for reference ---
    # item_dict = get_item_info("D:/document/Python/ml-latest-small/movies.csv")
    # print(len(item_dict))
    # print(item_dict['11'][0])
    # score_dict = get_ave_score("D:/document/Python/ml-latest-small/ratings.csv")
    # print(len(score_dict))
    # print(score_dict['31'])
    # train_data = get_train_data("D:/document/Python/ml-latest-small/ratings.csv")
    # print(len(train_data))
    # print(train_data[:50])