目录

LFM

隐语义模型LFM(latent factor model)

参考文章: https://blog.csdn.net/sinat_22594309/article/details/86576757 和 https://blog.csdn.net/harryhuang1990/article/details/9924377

简单来说就是对物品自动进行分类,通过算法来计算用户对每一类的喜好权重(其中类别数即为预设的LFM维度)
通过LFM算法可以得到用户的topLike(通过计算user*item得出用户最喜爱的几个物品)、item的topSimilar(通过计算item向量之间的距离得到最相似的几个物品)、item的topic(通过聚类可得到物品类别)

用户u对物品i的喜好程度可用公式表示为:$p^{LFM}(u,i)=p_u^Tq_i=\sum_{f=1}^Fp_{uf}q_{if}$

损失函数(加入L2正则化项):$\sum_{(u,i)\in D}(p(u,i)-p^{LFM}(u,i))^2 + \lambda ||p_u||^2 + \lambda ||q_i||^2$

步骤:

  1. 选取k个维度
  2. p和q随机初始化
import csv
import operator
import os

import numpy as np

def get_item_info(input_file):
    """Load movie metadata from a CSV file.

    The first line is treated as a header and skipped. Each remaining row is
    expected to look like ``id,title,genre``; titles may be quoted and may
    contain commas (e.g. ``"American President, The (1995)"``).

    Args:
        input_file: Path to the movies CSV file (read as UTF-8).

    Returns:
        A dict ``{itemid: [title, genre]}``. An empty dict is returned when
        the file does not exist or contains no data rows.
    """
    if not os.path.exists(input_file):
        return {}
    item_info = {}
    # newline='' is what the csv module requires so that quoted fields are
    # parsed correctly regardless of the platform's line endings.
    with open(input_file, encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        for row in reader:
            if len(row) < 3:
                continue
            itemid = row[0].strip()
            # csv.reader already removes the surrounding quotes and unescapes
            # doubled quotes. Re-joining the middle fields additionally covers
            # titles that contain commas but were written without quoting.
            title = ','.join(row[1:-1])
            genre = row[-1].strip()
            item_info[itemid] = [title, genre]
    return item_info

def get_ave_score(input_file):
    """Compute the average rating of every item in a ratings CSV file.

    The first line is treated as a header and skipped. Each remaining line is
    split on commas and must have at least four fields; the second field is
    used as the item id and the third as the rating. Shorter lines are
    ignored, extra trailing fields are tolerated.

    Args:
        input_file: Path to the ratings CSV file (read as UTF-8).

    Returns:
        A dict ``{itemid: average_rating}`` with averages rounded to three
        decimals. An empty dict is returned when the file does not exist or
        has no data rows.

    Raises:
        ValueError: If a rating field cannot be converted to ``float``.
    """
    if not os.path.exists(input_file):
        return {}
    record_dict = {}  # itemid -> [number of ratings, sum of ratings]
    with open(input_file, encoding='utf-8') as f:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(f, None)
        for line in f:
            fields = line.strip().split(",")
            if len(fields) < 4:
                continue
            # Index instead of unpacking so rows with extra columns do not
            # raise ValueError.
            itemid = fields[1]
            rating = float(fields[2])
            stats = record_dict.setdefault(itemid, [0, 0])
            stats[0] += 1
            stats[1] += rating
    return {
        itemid: round(total / count, 3)
        for itemid, (count, total) in record_dict.items()
    }

def get_train_data(input_file):
    """Build balanced positive/negative training samples for LFM.

    Ratings at or above 4.0 become positive samples for the user; all other
    ratings become negative candidates, tagged with the item's average score
    (0 when no average is available). For every user the same number of
    positives and negatives is kept, namely the smaller of the two counts.
    Negatives are chosen by descending average score: a user disliking an
    item that is generally rated highly is taken as a stronger sign of real
    disinterest.

    Args:
        input_file: Path to the ratings CSV file (header line + data rows).

    Returns:
        A list of ``(userid, itemid, label)`` tuples with label 1 (positive)
        or 0 (negative). An empty list is returned if the file is missing.
    """
    if not os.path.exists(input_file):
        return []

    score_thr = 4.0  # ratings >= this value count as positive
    avg_scores = get_ave_score(input_file)

    liked = {}     # userid -> [itemid, ...]
    disliked = {}  # userid -> [(itemid, average score), ...]
    with open(input_file, encoding='utf-8') as f:
        next(f)  # skip the header line
        for line in f:
            fields = line.strip().split(",")
            if len(fields) < 4:
                continue
            userid, itemid, rating, _timestamp = fields
            user_pos = liked.setdefault(userid, [])
            user_neg = disliked.setdefault(userid, [])
            if float(rating) >= score_thr:
                user_pos.append(itemid)
            else:
                user_neg.append((itemid, avg_scores.get(itemid, 0)))

    # Balance the classes per user.
    train_data = []
    for userid, pos_items in liked.items():
        neg_items = disliked[userid]
        data_num = min(len(pos_items), len(neg_items))
        if data_num <= 0:
            continue
        train_data.extend((userid, itemid, 1) for itemid in pos_items[:data_num])
        # Keep the negatives with the highest average score.
        top_neg = sorted(neg_items, key=operator.itemgetter(1), reverse=True)[:data_num]
        train_data.extend((userid, itemid, 0) for itemid, _score in top_neg)
    return train_data

def lfm_train(train_data,F,lamda,alpha,step):
    """Train the latent factor model with stochastic gradient descent.

    Args:
        train_data: Iterable of ``(userid, itemid, label)`` samples. It is
            traversed once per epoch, so it must be re-iterable (e.g. a list).
        F: Number of latent dimensions of every user/item vector.
        lamda: L2 regularization coefficient.
        alpha: Initial learning rate; it is multiplied by 0.9 after each epoch.
        step: Number of epochs (full passes over ``train_data``).

    Returns:
        A tuple ``(user_vec, item_vec)`` of dicts mapping ids to numpy vectors
        of length ``F``.
    """
    user_vec = {}
    item_vec = {}
    for step_index in range(step):
        print("迭代"+str(step_index+1)+"次,共"+str(step)+"次")
        for userid,itemid,label in train_data:
            # Vectors are created lazily the first time an id is seen.
            if userid not in user_vec:
                user_vec[userid] = init_model(F)
            if itemid not in item_vec:
                item_vec[itemid] = init_model(F)
            p_u = user_vec[userid]
            q_i = item_vec[itemid]
            delta = label - model_predict(p_u,q_i)
            # In-place updates, so the arrays stored in the dicts change.
            # As in the original code, the item update sees the user vector
            # that was just updated.
            p_u += alpha*(delta*q_i - lamda*p_u)
            q_i += alpha*(delta*p_u - lamda*q_i)
        # Decay the learning rate once per epoch. Decaying it after every
        # sample drives it to ~0 within a few hundred samples, so almost
        # nothing would be learned from the rest of the data.
        alpha = alpha * 0.9
    return user_vec,item_vec

def init_model(vector_len):
    """Return a fresh random vector used to initialize a user or item.

    Args:
        vector_len: Number of components of the vector.

    Returns:
        A numpy array of ``vector_len`` samples drawn with ``np.random.randn``
        (standard normal distribution).
    """
    initial_vector = np.random.randn(vector_len)
    return initial_vector

def model_predict(user_vec,item_vec):
    """Return the model score for a user vector and an item vector.

    The score is the dot product divided by the product of the two Euclidean
    norms (cosine similarity), which normalizes it.

    Args:
        user_vec: Numeric vector of the user.
        item_vec: Numeric vector of the item, same length as ``user_vec``.

    Returns:
        The normalized score, or ``0.0`` when either vector has zero norm.
    """
    norm_product = np.linalg.norm(user_vec)*np.linalg.norm(item_vec)
    # A zero vector has no direction; returning 0.0 avoids a 0/0 division
    # whose NaN result would otherwise spread through training.
    if norm_product == 0:
        return 0.0
    return np.dot(user_vec,item_vec)/norm_product

def recommend(userid, res_num=10,
              ratings_file="D:/document/Python/ml-latest-small/ratings.csv"):
    """Train the LFM model and return the top recommendations for a user.

    The model is trained from scratch on every call with 50 latent
    dimensions, regularization 0.01, learning rate 0.1 and 50 epochs.

    Args:
        userid: Id of the user, in the same form as in the ratings file.
        res_num: Maximum number of recommendations to return.
        ratings_file: Path to the ratings CSV file used for training.

    Returns:
        A list ``[(itemid, score), ...]`` sorted by descending score, with
        scores rounded to three decimals. The list is empty when the user has
        no vector after training (unknown user, or no training data).
    """
    train_data = get_train_data(ratings_file)
    user_vec,item_vec = lfm_train(train_data,50,0.01,0.1,50)

    # Users absent from the training samples have no vector; indexing the
    # dict directly would raise KeyError.
    if userid not in user_vec:
        return []

    target_vec = user_vec[userid]
    # Score every item with the same function that was used during training.
    record = {
        itemid: model_predict(target_vec, vec)
        for itemid, vec in item_vec.items()
    }
    ranked = sorted(record.items(), key=operator.itemgetter(1), reverse=True)[:res_num]
    return [(itemid, round(score,3)) for itemid, score in ranked]

def analysis_recom_result(train_data, userid, recom_list,
                          movies_file="D:/document/Python/ml-latest-small/movies.csv"):
    """Print the movies a user liked followed by the recommended movies.

    Args:
        train_data: Iterable of ``(userid, itemid, label)`` samples; entries
            of ``userid`` with label 1 are printed as liked movies.
        userid: Id of the user being analysed.
        recom_list: List of ``(itemid, score)`` recommendations.
        movies_file: Path to the movies CSV file providing ``[title, genre]``.

    Returns:
        None. The result is written to standard output.
    """
    item_info = get_item_info(movies_file)

    def describe(itemid):
        # Items missing from the metadata get a placeholder instead of
        # aborting the whole report with KeyError.
        return item_info.get(itemid, [itemid, "unknown"])

    # Movies the user liked (positive training samples).
    for tmp_userid,itemid,label in train_data:
        if tmp_userid == userid and label == 1:
            print(describe(itemid))
    # Recommended movies.
    print("recom rersult:")
    for item in recom_list:
        print(describe(item[0]))

if __name__ == '__main__':
    # Demo run: recommend for user '1', print the list, then print the movies
    # the user liked next to the recommended ones.
    target_user = '1'
    ratings_path = "D:/document/Python/ml-latest-small/ratings.csv"

    recom_list = recommend(target_user)
    print(recom_list)

    demo_train_data = get_train_data(ratings_path)
    analysis_recom_result(demo_train_data, target_user, recom_list)

    # Manual checks kept for reference; uncomment to run.
    #
    # get_item_info:
    # item_dict = get_item_info("D:/document/Python/ml-latest-small/movies.csv")
    # print(len(item_dict))
    # print(item_dict['11'][0])
    #
    # get_ave_score:
    # score_dict = get_ave_score("D:/document/Python/ml-latest-small/ratings.csv")
    # print(len(score_dict))
    # print(score_dict['31'])
    #
    # get_train_data:
    # train_data = get_train_data("D:/document/Python/ml-latest-small/ratings.csv")
    # print(len(train_data))
    # print(train_data[:50])