Contents
1.数据读取
import pandas as pd import numpy as np import pymongo data = pd.DataFrame(pd.read_excel('000.xlsx', index=False)) client = pymongo.MongoClient("mongodb://XX:XXXXX@192.168.3.7:2018",connect=False) db = client["test"] table = db["python"] df = pd.DataFrame(list(table.find()))
可以从excel,csv,mongo数据之类的读取数据
2.遍历
for i in range(data.index.max()): if any([ 'missing' in data.loc[i,:].values, data.loc[i,'hour'] not in range(25), ]): print('已删除存在异常值 %s 行数据'%i) data.drop([i],inplace=True) for i in range(0,len(df)): info = df.loc[i].to_dict()
3. 去空(NA)
3.1直接去除
from numpy import nan as NA data=Series([1,NA,3.5,NA,7]) print(data.dropna()) #至少2个NA才删除 print(data.dropna(thresh=2))
3.2 用中位数或者平均数进行填充
df = df.fillna(df.median()) print(df.fillna(df.mean()))
4.对字段进行处理
def get_salary(salary): s = 0 if "-" in salary: for part in salary.split("-"): if "万" in part: q = float(part[:-1]) * 10000 else: q = float(part[:-1]) * 1000 s += q return int(s/2.0) else: return np.nan df["salary"] = df["salary"].apply(get_salary) df.head() df["company"]=df["company"].apply(lambda x :x.split("/")[0].strip('"'))
5.删除重复
df["company"].drop_duplicates()
6.只留部分
df.loc[:,["address","company"]] df_c = df_c.iloc[:,[4,5]] del data["name_grade"] del data["info_grade"]
7. 排序
df.sort_values(by='col1', ascending=False)
8. isin
mask = df['A'].isin([1]) #括号中必须为list
9. merge
df1 = pd.DataFrame({'name':['kate', 'herz', 'catherine', 'sally'], 'age':[25, 28, 39, 35]}) df2 = pd.DataFrame({'name_t':['kate', 'herz', 'catherine', 'sally'], 'score':[70, 60, 90, 100]}) print(pd.merge(df1, df2, left_on="name", right_on="name_t").drop('name_t', axis=1))
10.保存为csv,或者到mongo
df["company"].drop_duplicates().to_csv("company.csv",encoding="utf-8", index=False) db[MONGO_TABLE].insert(row.to_dict())
11.使用value_count()来得到每个出现的次数
x = df.score.value_counts().index, y = df.score.value_counts().values,
12. reset_index
df = pd.read_csv('2318.HK.csv') df = df.loc[df['Date'] >= '2018-01-01'] df = df.reset_index(drop=True) df
12. crosstab 看交叉
pd.crosstab(df.age, df.NumberOfDependents)
13.更改列的次序
order = ['name','type','value','date'] df = df[order]
14.更改columns
df.rename(columns={'rank':'value', 'rank_date':'date', 'site_name':'name'}, inplace = True)
15.显示多一些行和列
pd.set_option('max_columns',1000) pd.set_option('max_row',300)
16.首字母大写
df['site_name'] = df['site_name'].str.capitalize()
17.更改index
DataFrame.index = [newName]
http://www.waitingfy.com/archives/3603
3603