目录

itemCF & userCF

ItemCF(Item-based Collaborative Filtering基于物品的协同过滤)

给用户推荐与他之前喜欢的物品相似的物品

关键:计算用户u对物品j的兴趣
学术上:

/images/recommenderSystem/itemcf.png

工业上:
应降低活跃用户在相似度公式中的贡献度

/images/recommenderSystem/itemcf1.png

用户在不同时间对item的操作应给予时间衰减惩罚

/images/recommenderSystem/itemcf2.png

Δt表示用户u对物品j发生行为与对物品i发生行为之间的时间差

算法实现:

  1. 建立用户物品倒排表 {user:{item:score}}
  2. 计算共现矩阵C {item:{item:times}}
  3. 计算余弦相似度矩阵W [item][item]
  4. 根据物品的相似度和用户的历史行为给用户生成推荐列表
#-*-coding:utf-8-*-

import math

class ItemBasedCF:
    """Item-based collaborative filtering recommender.

    Ratings are loaded from a comma-separated file whose first line is a
    header and whose remaining lines are ``user,item,score,timestamp``.

    Attributes set by this class:
        train_file: path of the ratings file passed to the constructor.
        train: ``{user: {item: score}}`` built by :meth:`readData`.
        W: ``{item: {item: similarity}}`` built by :meth:`ItemSimilarity`.
    """

    def __init__(self,train_file):
        self.train_file = train_file
        self.readData()

    def readData(self):
        """Load the ratings file into ``self.train``.

        The header line is skipped and blank lines are ignored, so an empty
        file or a file with trailing newlines yields an empty/partial table
        instead of raising.

        Raises:
            ValueError: if a non-blank line does not have exactly four
                comma-separated fields or its score is not numeric.
        """
        self.train = {}     # user -> {item: score}
        with open(self.train_file,'r') as f:
            # Skip the header; the default keeps an empty file from raising
            # StopIteration.
            next(f, None)
            # Iterate lazily instead of materialising the file via readlines().
            for line in f:
                line = line.strip()
                if not line:
                    continue
                user, item, score, _timestamp = line.split(",")
                # NOTE(review): int() truncates fractional ratings
                # (4.5 -> 4, 0.5 -> 0); kept as in the original -- confirm
                # this is intended.
                self.train.setdefault(user, {})[item] = int(float(score))

    def ItemSimilarity(self):
        """Compute and return the item-item similarity matrix ``self.W``.

        For every pair of distinct items rated by the same user a
        co-occurrence count is accumulated, then normalised as
        ``W[i][j] = C[i][j] / sqrt(N[i] * N[j])`` where ``N[x]`` is the
        number of users who rated item ``x``.
        """
        C = {}  # item -> {item: number of users who rated both}
        N = {}  # item -> number of users who rated it
        for items in self.train.values():
            for i in items:
                N[i] = N.get(i, 0) + 1
                row = C.setdefault(i, {})
                for j in items:
                    if i == j:
                        continue
                    row[j] = row.get(j, 0) + 1

        self.W = {}
        for i, related_items in C.items():
            n_i = N[i]
            self.W[i] = {
                j: cij / math.sqrt(n_i * N[j])
                for j, cij in related_items.items()
            }
        return self.W

    def Recommend(self,user,K=3,N=10):
        """Return up to ``N`` recommended items for ``user``.

        For each item the user has rated, its ``K`` most similar items are
        taken; those the user has not rated yet accumulate
        ``score * similarity``. The result is a dict ``{item: rank}`` ordered
        by descending rank.

        The similarity matrix is computed on first use if
        :meth:`ItemSimilarity` has not been called yet.

        Raises:
            KeyError: if ``user`` is not present in the training data.
        """
        if getattr(self, "W", None) is None:
            self.ItemSimilarity()

        rank = {}
        action_item = self.train[user]     # items the user rated, with scores
        for item, score in action_item.items():
            # .get() tolerates items added to ``train`` after W was computed.
            neighbours = sorted(
                self.W.get(item, {}).items(),
                key=lambda x: x[1],
                reverse=True,
            )[:K]
            for j, wj in neighbours:
                if j in action_item:
                    continue
                rank[j] = rank.get(j, 0) + score * wj
        return dict(sorted(rank.items(), key=lambda x: x[1], reverse=True)[:N])


# 列出用户1观看过的电影

# with open("D:/document/Python/ml-latest-small/ratings.csv", 'r') as f1:
#     next(f1)
#     for line in f1.readlines():
#         userId,movie,rating,timestamp = line.strip().split(",")
#         if (userId == '1'):
#             with open("D:/document/Python/ml-latest-small/movies.csv", 'r', encoding='UTF-8') as f2:
#                 next(f2)
#                 for line2 in f2.readlines():
#                     item = line2.strip().split(",")
#                     if len(item) > 3:
#                         continue
#                     [movieId,title,genres] = item
#                     if movieId == movie:
#                         print(title,'\t',genres)


# Build an item-based recommender from the MovieLens ratings file, compute
# the item similarity matrix, then print the recommendations for user "1"
# as "<item> \t <rank>" lines.
Item = ItemBasedCF("D:/document/Python/ml-latest-small/ratings.csv")
Item.ItemSimilarity()
recommedDic = Item.Recommend("1")
for movie_id in recommedDic:
    print(movie_id, "\t", recommedDic[movie_id])

UserCF(User-based Collaborative Filtering基于用户的协同过滤)

给用户推荐相似兴趣用户感兴趣的物品

关键:寻找与目标用户兴趣相似的用户集合,找出该集合中用户感兴趣、而目标用户尚未产生过行为的item
学术上:
/images/recommenderSystem/usercf.png

工业上: 降低那些异常活跃物品对用户相似度的贡献
/images/recommenderSystem/usercf1.png

不同用户对同一item行为的时间段不同应该给予时间惩罚
/images/recommenderSystem/usercf2.png

算法实现:

  1. 建立物品用户倒排表 {item:{user:score}}
  2. 计算共现矩阵C {user:{user:times}}
  3. 计算余弦相似度矩阵W [user][user]
  4. 根据用户的相似度和相似用户的历史行为给用户生成推荐列表
#-*-coding:utf-8-*-

import math

class UserBasedCF:
    """User-based collaborative filtering recommender.

    Ratings are loaded from a comma-separated file whose first line is a
    header and whose remaining lines are ``user,item,score,timestamp``.

    Attributes set by this class:
        train_file: path of the ratings file passed to the constructor.
        item2user: ``{item: {user: score}}`` built by :meth:`readData`.
        user2item: ``{user: {item: score}}`` built by :meth:`readData`.
        W: ``{user: {user: similarity}}`` built by :meth:`UserSimilarity`;
            each inner dict is ordered by descending similarity.
    """

    def __init__(self,train_file):
        self.train_file = train_file
        self.readData()

    def readData(self):
        """Load the ratings file into ``self.item2user`` and ``self.user2item``.

        The header line is skipped and blank lines are ignored, so an empty
        file or a file with trailing newlines does not raise.

        Raises:
            ValueError: if a non-blank line does not have exactly four
                comma-separated fields or its score is not numeric.
        """
        self.item2user = {}     # item -> {user: score}
        self.user2item = {}     # user -> {item: score}
        with open(self.train_file,'r') as f:
            # Skip the header; the default keeps an empty file from raising
            # StopIteration.
            next(f, None)
            # Iterate lazily instead of materialising the file via readlines().
            for line in f:
                line = line.strip()
                if not line:
                    continue
                user, item, score, _timestamp = line.split(",")
                # NOTE(review): int() truncates fractional ratings
                # (4.5 -> 4, 0.5 -> 0); kept as in the original -- confirm
                # this is intended.
                rating = int(float(score))
                self.item2user.setdefault(item, {})[user] = rating
                self.user2item.setdefault(user, {})[item] = rating

    def UserSimilarity(self):
        """Compute and return the user-user similarity matrix ``self.W``.

        For every pair of distinct users who rated the same item a
        co-occurrence count is accumulated, then normalised as
        ``W[u][v] = C[u][v] / sqrt(N[u] * N[v])`` where ``N[x]`` is the
        number of items user ``x`` rated. Each row of ``W`` is stored sorted
        by descending similarity; :meth:`Recommend` relies on that order.
        """
        C = {}  # user -> {user: number of items both rated}
        N = {}  # user -> number of items the user rated
        for users in self.item2user.values():
            for u in users:
                N[u] = N.get(u, 0) + 1
                row = C.setdefault(u, {})
                for v in users:
                    if u == v:
                        continue
                    row[v] = row.get(v, 0) + 1

        self.W = {}
        for u, related_users in C.items():
            n_u = N[u]
            sims = {
                v: cuv / math.sqrt(n_u * N[v])
                for v, cuv in related_users.items()
            }
            self.W[u] = dict(sorted(sims.items(), key=lambda x: x[1], reverse=True))

        return self.W

    def Recommend(self,user,K=5,N=10):
        """Return up to ``N`` items recommended to ``user``.

        The ``K`` most similar users are taken from ``self.W[user]``; every
        item they rated that ``user`` has not rated accumulates
        ``similarity * score``. The result is a dict ``{item: rank}`` ordered
        by descending rank.

        The similarity matrix is computed on first use if
        :meth:`UserSimilarity` has not been called yet.

        Raises:
            KeyError: if ``user`` is not present in the training data.
        """
        if getattr(self, "W", None) is None:
            self.UserSimilarity()

        rank = {}
        seen = self.user2item[user]
        neighbours = self.W[user]
        # Rows of W are already sorted, so the first K keys are the top K.
        for u in list(neighbours)[:K]:
            similarity = neighbours[u]
            for i, score in self.user2item[u].items():
                if i not in seen:
                    rank[i] = rank.get(i, 0) + similarity * score

        return dict(sorted(rank.items(), key=lambda x: x[1], reverse=True)[:N])

# Print the movies that a user rated at or above a threshold.
def list_movies(user,rate=5,
                ratings_file="D:/document/Python/ml-latest-small/ratings.csv",
                movies_file="D:/document/Python/ml-latest-small/movies.csv"):
    """Print every movie ``user`` rated ``>= rate``, in ratings-file order.

    Each output line is ``rating \\t title \\t\\t\\t genres``.

    Args:
        user: user id as it appears in the ratings file (a string).
        rate: minimum rating (inclusive) for a movie to be listed.
        ratings_file: CSV with a header and ``userId,movieId,rating,timestamp``
            rows.
        movies_file: UTF-8 CSV with a header and ``movieId,title,genres``
            rows; titles containing commas are expected to be quoted.

    The movies file is read once, and only when at least one rating matches.
    It is parsed with the ``csv`` module so quoted titles that contain commas
    are listed instead of being silently skipped. Rows that do not have
    exactly three fields are ignored. If a movie id appears more than once in
    the movies file, the last row wins.
    """
    import csv

    # Pass 1: collect the (movie id, rating) pairs that qualify.
    matches = []
    with open(ratings_file, 'r') as f1:
        next(f1, None)  # skip header; tolerate an empty file
        for line in f1:
            line = line.strip()
            if not line:
                continue
            userId, movie, rating, _timestamp = line.split(",")
            if userId == user and float(rating) >= rate:
                matches.append((movie, rating))
    if not matches:
        return

    # Pass 2: a single scan of the movies file instead of one scan per match.
    wanted = {movie for movie, _ in matches}
    titles = {}
    with open(movies_file, 'r', encoding='UTF-8', newline='') as f2:
        reader = csv.reader(f2)
        next(reader, None)  # skip header
        for row in reader:
            if len(row) != 3:
                continue
            movieId, title, genres = row
            if movieId in wanted:
                titles[movieId] = (title, genres)

    for movie, rating in matches:
        if movie in titles:
            title, genres = titles[movie]
            print(rating,'\t',title,'\t\t\t',genres)

# 列出用户1评分5分以上的电影
# list_movies('1',5)

# Build a user-based recommender from the MovieLens ratings file, compute
# the user similarity matrix, then print the recommendations for user "1"
# as "<item> \t <rank>" lines.
UserCF = UserBasedCF("D:/document/Python/ml-latest-small/ratings.csv")
UserCF.UserSimilarity()

recommedDic = UserCF.Recommend("1")
for movie_id in recommedDic:
    print(movie_id, "\t", recommedDic[movie_id])