目录

基于内容的推荐

基于内容的推荐 Content Based

优点:

  1. 思路简单,可解释性强
  2. 用户推荐具有独立性,不受其他用户的影响

缺点:

  1. 广度较窄,过多地关注用户喜好的物品
  2. 要积累一定量的用户行为

步骤:

  1. 对物品进行刻画 item profile
    1. topic finding
    2. genre classify
  2. 对用户进行刻画 user profile
    1. genre/topic
    2. time decay
  3. 在线推荐
    1. find the user's top k genres/topics
    2. get the best n items from each selected genre/topic
import os

# 获得物品平均评分
def get_ave_score(input_file):
    """Compute the average rating of every item in a ratings CSV file.

    The first line is treated as a header and skipped. Each remaining line
    is split on commas; the second field is used as the item id and the
    third as the rating. Lines with fewer than four fields are ignored.

    Args:
        input_file: Path to the ratings file.

    Returns:
        A dict mapping item id (str) to its mean rating rounded to three
        decimals. Empty when the file does not exist or has no data rows.

    Raises:
        ValueError: If a rating field cannot be parsed as a float.
    """
    if not os.path.exists(input_file):
        return {}
    # item id -> [sum of ratings, number of ratings]
    totals = {}
    with open(input_file, "r", encoding="utf-8") as f:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(f, None)
        for line in f:
            fields = line.strip().split(",")
            if len(fields) < 4:
                continue
            itemid, rating = fields[1], float(fields[2])
            entry = totals.setdefault(itemid, [0, 0])
            entry[0] += rating
            entry[1] += 1
    return {
        itemid: round(rating_sum / rating_count, 3)
        for itemid, (rating_sum, rating_count) in totals.items()
    }

# Get each item's categories, and rank the items of every category by average score in descending order
def get_item_cate(ave_score, input_file, topK=100):
    """Build item category weights and per-category ranked item lists.

    The first line of the file is treated as a header and skipped. Each
    remaining line is split on commas; the first field is the item id and
    the last field is a "|"-separated list of categories. Lines with fewer
    than three fields are ignored.

    Args:
        ave_score: Dict mapping item id to its average rating. Items missing
            from it are ranked with a score of 0.
        input_file: Path to the item file.
        topK: Maximum number of items kept per category (default 100).

    Returns:
        A tuple ``(item_cate, cate_item_sort)``:
        ``item_cate`` maps item id -> {category: weight}, where an item's
        categories share the weight equally (``round(1 / n, 3)``);
        ``cate_item_sort`` maps category -> list of at most ``topK`` item ids
        ordered by average score, highest first.
        Both are empty when the file does not exist or has no data rows.
    """
    if not os.path.exists(input_file):
        return {}, {}
    item_cate = {}
    with open(input_file, "r", encoding="utf-8") as f:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(f, None)
        for line in f:
            fields = line.strip().split(",")
            if len(fields) < 3:
                continue
            # Only the first and last fields are read, so extra commas in
            # the middle fields do not affect the result.
            itemid = fields[0]
            cate_list = fields[-1].strip().split("|")
            # An item may have several categories; split the weight evenly.
            ratio = round(1 / len(cate_list), 3)
            weights = item_cate.setdefault(itemid, {})
            for fix_cate in cate_list:
                weights[fix_cate] = ratio

    # category -> {item id: average score}
    cate_scores = {}
    for itemid, weights in item_cate.items():
        score = ave_score.get(itemid, 0)
        for cate in weights:
            cate_scores.setdefault(cate, {})[itemid] = score

    cate_item_sort = {}
    for cate, scores in cate_scores.items():
        ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        cate_item_sort[cate] = [itemid for itemid, _ in ranked[:topK]]
    return item_cate, cate_item_sort


def get_up(item_cate, input_file, score_thr=4.0, topK=2):
    """Build user profiles: each user's top categories with normalized weights.

    The first line of the file is treated as a header and skipped. Each
    remaining line is split on commas into user id, item id, rating and
    timestamp (fields 0-3); lines with fewer than four fields are ignored.
    Only ratings strictly greater than ``score_thr`` on items present in
    ``item_cate`` contribute. Each contribution to a category is
    ``rating * get_time_score(timestamp) * category weight``.

    Args:
        item_cate: Dict mapping item id -> {category: weight}.
        input_file: Path to the ratings file.
        score_thr: Ratings less than or equal to this value are ignored
            (default 4.0).
        topK: Number of highest-scoring categories kept per user (default 2).

    Returns:
        A dict mapping user id -> list of ``(category, weight)`` tuples,
        ordered by score, highest first. Weights are each category's share
        of the kept categories' total score, rounded to three decimals.
        Empty when the file does not exist or no row qualifies.

    Raises:
        ValueError: If a rating or timestamp field cannot be parsed.
    """
    if not os.path.exists(input_file):
        return {}
    # user id -> {category: accumulated score}
    record = {}
    with open(input_file, "r", encoding="utf-8") as f:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(f, None)
        for line in f:
            fields = line.strip().split(",")
            if len(fields) < 4:
                continue
            userid, itemid = fields[0], fields[1]
            # The rating is the third field; the fourth is the timestamp.
            rating, timestamp = float(fields[2]), int(fields[3])
            if rating <= score_thr:
                continue
            if itemid not in item_cate:
                continue
            time_score = get_time_score(timestamp)
            user_scores = record.setdefault(userid, {})
            for fix_cate, ratio in item_cate[itemid].items():
                user_scores[fix_cate] = (
                    user_scores.get(fix_cate, 0) + rating * time_score * ratio
                )

    up = {}
    for userid, user_scores in record.items():
        top = sorted(user_scores.items(), key=lambda pair: pair[1], reverse=True)[:topK]
        total_score = sum(score for _, score in top)
        # Guard against a zero total so normalization cannot divide by zero.
        up[userid] = [
            (cate, round(score / total_score, 3) if total_score else 0.0)
            for cate, score in top
        ]
    return up

def get_time_score(timestamp):
    """Return a time-decay weight for a rating made at ``timestamp``.

    The weight is ``1 / (1 + delta)`` rounded to three decimals, where
    ``delta`` is the gap between a fixed reference timestamp and
    ``timestamp``, expressed as a fraction of the reference timestamp.
    A timestamp equal to the reference yields 1.0; earlier timestamps
    yield smaller weights.
    """
    reference_ts = 1494273047
    elapsed_fraction = (reference_ts - timestamp) / reference_ts
    weight = 1 / (1 + elapsed_fraction)
    return round(weight, 3)


def recom(cate_item_sort, up, userid, topK = 10):
    """Recommend items to a user from the user's top categories.

    For every ``(category, weight)`` entry in the user's profile, the first
    ``int(topK * weight) + 1`` items of that category's ranked list are
    taken. An item that appears under several categories is recommended
    only once, at its first occurrence.

    Args:
        cate_item_sort: Dict mapping category -> ranked list of item ids.
        up: Dict mapping user id -> list of ``(category, weight)`` entries.
        userid: The user to recommend for.
        topK: Base number of recommendations distributed over the
            categories by weight (default 10).

    Returns:
        ``{userid: [item id, ...]}``, or an empty dict when the user has no
        profile in ``up``.
    """
    if userid not in up:
        return {}
    recom_list = []
    seen = set()
    for entry in up[userid]:
        cate, ratio = entry[0], entry[1]
        if cate not in cate_item_sort:
            continue
        num = int(topK * ratio) + 1
        for itemid in cate_item_sort[cate][:num]:
            # Skip items already recommended through an earlier category.
            if itemid in seen:
                continue
            seen.add(itemid)
            recom_list.append(itemid)
    return {userid: recom_list}


if __name__ == "__main__":
    # Smoke test: build the profiles from the local MovieLens files and
    # print the profile and the recommendations of one user.
    ratings_file = "D:/document/Python/ml-latest-small/ratings.csv"
    movies_file = "D:/document/Python/ml-latest-small/movies.csv"
    demo_user = "1"

    ave_score = get_ave_score(ratings_file)
    item_cate, cate_item_sort = get_item_cate(ave_score, movies_file)
    up = get_up(item_cate, ratings_file)
    print(len(up))
    # .get avoids a KeyError when the data files are missing or the user
    # has no qualifying ratings.
    print(up.get(demo_user))  # profile of the demo user
    print(recom(cate_item_sort, up, demo_user))  # recommendations for the demo user