import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

# MovieLens rating analysis: load the users / ratings / movies tables, merge
# them, and compare mean ratings per title between genders.

# The .dat files have no header row and use '::' as the field delimiter.
# A multi-character separator is only supported by the Python parsing engine,
# so request it explicitly instead of relying on pandas' fallback (which
# emits a ParserWarning).
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('ch02/movielens/users.dat', sep='::', header=None,
                      names=unames, engine='python')

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('ch02/movielens/ratings.dat', sep='::', header=None,
                        names=rnames, engine='python')

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('ch02/movielens/movies.dat', sep='::', header=None,
                       names=mnames, engine='python')

# Merge the three tables; pd.merge joins on the column names the frames share
# (ratings/users share 'user_id', the result and movies share 'movie_id').
data = pd.merge(pd.merge(ratings, users), movies)

# Pivot table: mean rating per title, one column per gender value.
mean_ratings = data.pivot_table(values='rating', index='title',
                                columns='gender', aggfunc='mean')

# Number of ratings each title received.
ratings_by_title = data.groupby('title').size()

# Keep only "active" titles, i.e. those with at least 250 ratings.
active_titles = ratings_by_title.index[ratings_by_title >= 250]
mean_ratings = mean_ratings.loc[active_titles]

# Titles ordered by mean female rating, highest first.
# sort_values(by=...) replaces the removed sort_index(by=...) spelling.
top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)

# Rating gap between genders (male mean minus female mean).
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']

# Ordered by the gap, ascending: titles preferred by women come first.
sorted_by_diff = mean_ratings.sort_values(by='diff')
# 06
# 6月
# python pandas 实战 电影评分处理
