itemCF & userCF
目录
ItemCF(Item-based Collaborative Filtering基于物品的协同过滤)
给用户推荐与他之前喜欢的物品相似的物品
关键:计算用户u对物品j的兴趣
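学术上:

$$w_{ij} = \frac{|N(i) \cap N(j)|}{\sqrt{|N(i)|\,|N(j)|}}, \qquad p_{uj} = \sum_{i \in N(u) \cap S(j,K)} w_{ji}\, r_{ui}$$

其中 $N(i)$ 为喜欢物品 i 的用户集合,$N(u)$ 为用户 u 喜欢的物品集合,$S(j,K)$ 为与物品 j 最相似的 K 个物品的集合,$r_{ui}$ 为用户 u 对物品 i 的兴趣(评分)。
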
工业上:
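活跃用户应该被降低在相似度公式中的贡献度:

$$w_{ij} = \frac{\sum_{u \in N(i) \cap N(j)} \frac{1}{\log(1 + |N(u)|)}}{\sqrt{|N(i)|\,|N(j)|}}$$

其中 $N(u)$ 为用户 u 产生过行为的物品集合,用户越活跃,其贡献越小。
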
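用户在不同时间对item的操作应给予时间衰减惩罚:

$$w_{ij} = \frac{\sum_{u \in N(i) \cap N(j)} f(\Delta t)}{\sqrt{|N(i)|\,|N(j)|}}, \qquad f(\Delta t) = \frac{1}{1 + \alpha\,\Delta t}, \qquad \Delta t = |t_{ui} - t_{uj}|$$

Δt 表示用户 u 对物品 i 发生行为与对物品 j 发生行为的时间差,α 为时间衰减系数。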
算法实现:
- 建立用户物品倒排表 `{user:{item:score}}`
- 计算共现矩阵C `{item:{item:times}}`
- 计算余弦相似度矩阵W `[item][item]`
- 根据物品的相似度和用户的历史行为给用户生成推荐列表
#-*-coding:utf-8-*-
import math
class ItemBasedCF:
    """Item-based collaborative filtering over a ``user,item,score,time`` CSV.

    Typical use: construct with the path of the ratings file, call
    ``ItemSimilarity()`` to build the item-item similarity matrix, then call
    ``Recommend(user)``.

    Attributes:
        train_file: Path of the ratings file given to the constructor.
        train: ``{user: {item: score}}`` table filled by ``readData``.
        W: ``{item: {item: similarity}}`` matrix filled by ``ItemSimilarity``
            (``None`` until it has been computed).
    """

    def __init__(self, train_file):
        """Store the ratings file path and load the ratings immediately.

        Args:
            train_file: Path of a comma-separated file whose first line is a
                header and whose remaining lines are ``user,item,score,time``.
        """
        self.train_file = train_file
        # Not computed yet; Recommend() builds it on demand if needed.
        self.W = None
        self.readData()

    def readData(self):
        """Read ``self.train_file`` into ``self.train``.

        The first line is treated as a header and skipped. Blank lines
        (for example a trailing newline at the end of the file) are ignored
        instead of crashing the four-way unpack. An empty file yields an
        empty table.

        Raises:
            OSError: If the file cannot be opened.
            ValueError: If a non-blank line does not have exactly four
                comma-separated fields or its score is not numeric.
        """
        self.train = {}  # user -> {item: score}
        with open(self.train_file, 'r') as f:
            # Skip the header line; the default avoids StopIteration on an
            # empty file.
            next(f, None)
            for line in f:
                line = line.strip()
                if not line:
                    continue
                user, item, score, _ = line.split(",")
                # Scores are truncated to int (e.g. "4.5" -> 4); this is the
                # established behavior and is kept so results do not shift.
                self.train.setdefault(user, {})[item] = int(float(score))

    def ItemSimilarity(self):
        """Build and return the item-item cosine similarity matrix.

        For every pair of distinct items rated by the same user the
        co-occurrence count is incremented; the similarity is then
        ``C[i][j] / sqrt(N[i] * N[j])`` where ``N[i]`` is the number of users
        that rated item ``i``.

        Returns:
            ``self.W``, a ``{item: {item: similarity}}`` dict. Every item in
            ``self.train`` has a row, possibly empty.
        """
        co_counts = {}    # item -> {other item: number of users with both}
        user_counts = {}  # item -> number of users that rated it
        for items in self.train.values():
            for i in items:
                user_counts[i] = user_counts.get(i, 0) + 1
                row = co_counts.setdefault(i, {})
                for j in items:
                    if i != j:
                        row[j] = row.get(j, 0) + 1

        self.W = {
            i: {
                j: cij / math.sqrt(user_counts[i] * user_counts[j])
                for j, cij in related_items.items()
            }
            for i, related_items in co_counts.items()
        }
        return self.W

    def Recommend(self, user, K=3, N=10):
        """Recommend up to ``N`` items that ``user`` has not interacted with.

        For each item in the user's history, its ``K`` most similar items
        contribute ``score * similarity`` to the candidate's rank.

        Args:
            user: User id, as it appears in the ratings file (a string).
            K: Number of nearest neighbour items considered per history item.
            N: Maximum number of recommendations returned.

        Returns:
            ``{item: rank}`` ordered by descending rank. Empty when the user
            is unknown or has no history.
        """
        action_item = self.train.get(user)
        if not action_item:
            # Unknown user or empty history: nothing to base a ranking on.
            return {}
        if getattr(self, "W", None) is None:
            # Allow Recommend() to be called without a prior ItemSimilarity().
            self.ItemSimilarity()

        rank = {}
        for item, score in action_item.items():
            neighbours = sorted(
                self.W.get(item, {}).items(),
                key=lambda pair: pair[1],
                reverse=True,
            )[:K]
            for j, wj in neighbours:
                if j in action_item:
                    continue  # never recommend something already seen
                rank[j] = rank.get(j, 0) + score * wj
        ordered = sorted(rank.items(), key=lambda pair: pair[1], reverse=True)
        return dict(ordered[:N])
# 列出用户1观看过的电影
# with open("D:/document/Python/ml-latest-small/ratings.csv", 'r') as f1:
# next(f1)
# for line in f1.readlines():
# userId,movie,rating,timestamp = line.strip().split(",")
# if (userId == '1'):
# with open("D:/document/Python/ml-latest-small/movies.csv", 'r', encoding='UTF-8') as f2:
# next(f2)
# for line2 in f2.readlines():
# item = line2.strip().split(",")
# if len(item) > 3:
# continue
# [movieId,title,genres] = item
# if movieId == movie:
# print(title,'\t',genres)
# Build an item-based recommender from the MovieLens ratings file.
Item = ItemBasedCF("D:/document/Python/ml-latest-small/ratings.csv")
# Compute the item-item similarity matrix that Recommend() reads.
Item.ItemSimilarity()
# Recommend for user "1" with the default K and N.
recommedDic = Item.Recommend("1")
# Print one "item <TAB> rank" row per recommendation, best first.
for k in recommedDic:
    v = recommedDic[k]
    print(k, "\t", v)
UserCF(User-based Collaborative Filtering基于用户的协同过滤)
给用户推荐相似兴趣用户感兴趣的物品
关键:寻找相似兴趣用户的集合,找到集合用户感兴趣而目标用户没行为过的item
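学术上:

$$w_{uv} = \frac{|N(u) \cap N(v)|}{\sqrt{|N(u)|\,|N(v)|}}, \qquad p_{ui} = \sum_{v \in S(u,K) \cap N(i)} w_{uv}\, r_{vi}$$

其中 $N(u)$ 为用户 u 有过行为的物品集合,$S(u,K)$ 为与用户 u 兴趣最相似的 K 个用户的集合,$N(i)$ 为对物品 i 有过行为的用户集合,$r_{vi}$ 为用户 v 对物品 i 的兴趣(评分)。
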
工业上:
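降低那些异常活跃物品对用户相似度的贡献:

$$w_{uv} = \frac{\sum_{i \in N(u) \cap N(v)} \frac{1}{\log(1 + |N(i)|)}}{\sqrt{|N(u)|\,|N(v)|}}$$

其中 $N(i)$ 为对物品 i 有过行为的用户集合,物品越热门,其贡献越小。
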
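不同用户对同一item行为的时间段不同应该给予时间惩罚:

$$w_{uv} = \frac{\sum_{i \in N(u) \cap N(v)} \frac{1}{1 + \alpha\,|t_{ui} - t_{vi}|}}{\sqrt{|N(u)|\,|N(v)|}}$$

其中 $t_{ui}$、$t_{vi}$ 分别为用户 u、v 对物品 i 发生行为的时间,α 为时间衰减系数。
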
算法实现:
- 建立物品用户倒排表 `{item:{user:score}}`
- 计算共现矩阵C `{user:{user:times}}`
- 计算余弦相似度矩阵W `[user][user]`
- 根据用户的相似度和用户的历史行为给用户生成推荐列表
#-*-coding:utf-8-*-
import math
class UserBasedCF:
    """User-based collaborative filtering over a ``user,item,score,time`` CSV.

    Typical use: construct with the path of the ratings file, call
    ``UserSimilarity()`` to build the user-user similarity matrix, then call
    ``Recommend(user)``.

    Attributes:
        train_file: Path of the ratings file given to the constructor.
        item2user: ``{item: {user: score}}`` table filled by ``readData``.
        user2item: ``{user: {item: score}}`` table filled by ``readData``.
        W: ``{user: {user: similarity}}`` matrix filled by ``UserSimilarity``
            (``None`` until it has been computed).
    """

    def __init__(self, train_file):
        """Store the ratings file path and load the ratings immediately.

        Args:
            train_file: Path of a comma-separated file whose first line is a
                header and whose remaining lines are ``user,item,score,time``.
        """
        self.train_file = train_file
        # Not computed yet; Recommend() builds it on demand if needed.
        self.W = None
        self.readData()

    def readData(self):
        """Read ``self.train_file`` into ``item2user`` and ``user2item``.

        The first line is treated as a header and skipped. Blank lines
        (for example a trailing newline at the end of the file) are ignored
        instead of crashing the four-way unpack. An empty file yields empty
        tables.

        Raises:
            OSError: If the file cannot be opened.
            ValueError: If a non-blank line does not have exactly four
                comma-separated fields or its score is not numeric.
        """
        self.item2user = {}  # item -> {user: score}
        self.user2item = {}  # user -> {item: score}
        with open(self.train_file, 'r') as f:
            # Skip the header line; the default avoids StopIteration on an
            # empty file.
            next(f, None)
            for line in f:
                line = line.strip()
                if not line:
                    continue
                user, item, score, _ = line.split(",")
                # Scores are truncated to int (e.g. "4.5" -> 4); this is the
                # established behavior and is kept so results do not shift.
                rating = int(float(score))
                self.item2user.setdefault(item, {})[user] = rating
                self.user2item.setdefault(user, {})[item] = rating

    def UserSimilarity(self):
        """Build and return the user-user cosine similarity matrix.

        For every pair of distinct users that rated the same item the
        co-occurrence count is incremented; the similarity is then
        ``C[u][v] / sqrt(N[u] * N[v])`` where ``N[u]`` is the number of items
        user ``u`` rated. Each row is stored sorted by descending similarity.

        Returns:
            ``self.W``, a ``{user: {user: similarity}}`` dict. Every user in
            the data has a row, possibly empty.
        """
        co_counts = {}    # user -> {other user: number of items both rated}
        item_counts = {}  # user -> number of items the user rated
        for users in self.item2user.values():
            for u in users:
                item_counts[u] = item_counts.get(u, 0) + 1
                row = co_counts.setdefault(u, {})
                for v in users:
                    if u != v:
                        row[v] = row.get(v, 0) + 1

        self.W = {}
        for u, related_users in co_counts.items():
            similarities = {
                v: cuv / math.sqrt(item_counts[u] * item_counts[v])
                for v, cuv in related_users.items()
            }
            # Stable sort: ties keep the order in which neighbours were met.
            self.W[u] = dict(
                sorted(similarities.items(), key=lambda pair: pair[1], reverse=True)
            )
        return self.W

    def Recommend(self, user, K=5, N=10):
        """Recommend up to ``N`` items taken from the ``K`` most similar users.

        Every item rated by one of the ``K`` nearest users and not yet rated
        by ``user`` accumulates ``similarity * score``.

        Args:
            user: User id, as it appears in the ratings file (a string).
            K: Number of most similar users to draw candidates from.
            N: Maximum number of recommendations returned.

        Returns:
            ``{item: rank}`` ordered by descending rank. Empty when the user
            is unknown.
        """
        seen = self.user2item.get(user)
        if seen is None:
            # Unknown user: no neighbours can be determined.
            return {}
        if getattr(self, "W", None) is None:
            # Allow Recommend() to be called without a prior UserSimilarity().
            self.UserSimilarity()

        # Sort explicitly rather than trusting the stored row order; on rows
        # produced by UserSimilarity() this is a no-op because sorting is
        # stable.
        neighbours = sorted(
            self.W.get(user, {}).items(),
            key=lambda pair: pair[1],
            reverse=True,
        )[:K]

        rank = {}
        for other, similarity in neighbours:
            for item, score in self.user2item.get(other, {}).items():
                if item not in seen:
                    rank[item] = rank.get(item, 0) + similarity * score
        ordered = sorted(rank.items(), key=lambda pair: pair[1], reverse=True)
        return dict(ordered[:N])
# 列出用户user观看过的高分(5分以上)电影
def _load_movie_index(movies_path):
    """Return ``{movieId: (title, genres)}`` parsed from the movies CSV."""
    # Local import so this block works without touching the file's imports.
    import csv

    index = {}
    with open(movies_path, 'r', encoding='UTF-8', newline='') as f2:
        reader = csv.reader(f2)
        next(reader, None)  # skip the header row
        for row in reader:
            if len(row) != 3:
                continue  # blank or malformed row
            movie_id, title, genres = row
            # Keep the first entry if an id is repeated.
            index.setdefault(movie_id, (title, genres))
    return index


def list_movies(user, rate=5,
                ratings_path="D:/document/Python/ml-latest-small/ratings.csv",
                movies_path="D:/document/Python/ml-latest-small/movies.csv"):
    """Print the movies that ``user`` rated at least ``rate``.

    One line is printed per matching rating, in ratings-file order, as
    ``rating <TAB> title <TAB><TAB><TAB> genres``.

    The movies file is parsed with ``csv.reader`` so quoted titles that
    contain commas (e.g. ``"American President, The (1995)"``) are listed;
    a plain ``split(",")`` would drop them. The file is read once, on the
    first matching rating, instead of once per matching rating.

    Args:
        user: User id to look up; converted with ``str`` so an int id works.
        rate: Minimum rating (inclusive) for a movie to be listed.
        ratings_path: Path of the ``userId,movieId,rating,timestamp`` CSV.
        movies_path: Path of the ``movieId,title,genres`` CSV (UTF-8).

    Raises:
        OSError: If a required file cannot be opened.
        ValueError: If a non-blank ratings line does not have exactly four
            comma-separated fields or its rating is not numeric.
    """
    user = str(user)
    movie_index = None  # loaded lazily: only needed once a rating matches
    with open(ratings_path, 'r') as f1:
        next(f1, None)  # skip the header line
        for line in f1:
            line = line.strip()
            if not line:
                continue
            user_id, movie, rating, _ = line.split(",")
            if user_id == user and float(rating) >= rate:
                if movie_index is None:
                    movie_index = _load_movie_index(movies_path)
                if movie in movie_index:
                    title, genres = movie_index[movie]
                    print(rating, '\t', title, '\t\t\t', genres)
# 列出用户1评分5分以上的电影
# list_movies('1',5)
# Build a user-based recommender from the MovieLens ratings file.
UserCF = UserBasedCF("D:/document/Python/ml-latest-small/ratings.csv")
# Compute the user-user similarity matrix that Recommend() reads.
UserCF.UserSimilarity()
# Recommend for user "1" with the default K and N.
recommedDic = UserCF.Recommend("1")
# Print one "item <TAB> rank" row per recommendation, best first.
for k in recommedDic:
    v = recommedDic[k]
    print(k, "\t", v)