Python 用pandas 进行数据清洗处理

1.数据读取

import pandas as pd
import numpy as np
import pymongo

data = pd.DataFrame(pd.read_excel('000.xlsx', index=False))

client = pymongo.MongoClient("mongodb://XX:XXXXX@192.168.3.7:2018",connect=False)
db = client["test"]
table = db["python"]
df = pd.DataFrame(list(table.find()))

可以从excel,csv,mongo数据之类的读取数据

2.遍历

for i in range(data.index.max()):  
    if any([  
        'missing' in data.loc[i,:].values,  
        data.loc[i,'hour'] not in range(25),  
        ]):  
  
        print('已删除存在异常值 %s 行数据'%i)  
        data.drop([i],inplace=True)
for i in range(0,len(df)):
    info = df.loc[i].to_dict()

3. 去空(NA)

 

3.1直接去除

from numpy import nan as NA
data=Series([1,NA,3.5,NA,7])
print(data.dropna())
#至少2个NA才删除
print(data.dropna(thresh=2))

3.2 用中位数或者平均数进行填充

df = df.fillna(df.median())
print(df.fillna(df.mean()))

4.对字段进行处理

 

def get_salary(salary):
    s = 0
    if "-" in salary:
        for part in salary.split("-"):
            if "万" in part:
                q = float(part[:-1]) * 10000
            else:
                q = float(part[:-1]) * 1000
            s += q
        return int(s/2.0)
    else:
        return np.nan    
df["salary"] = df["salary"].apply(get_salary)
df.head()
df["company"]=df["company"].apply(lambda x :x.split("/")[0].strip('"'))

5.删除重复

 

df["company"].drop_duplicates()

6.只留部分

df.loc[:,["address","company"]]
df_c = df_c.iloc[:,[4,5]]
del data["name_grade"]
del data["info_grade"]

7. 排序

df.sort_values(by='col1', ascending=False)

8. isin

mask = df['A'].isin([1]) #括号中必须为list

9. merge

df1 = pd.DataFrame({'name':['kate', 'herz', 'catherine', 'sally'], 'age':[25, 28, 39, 35]})
df2 = pd.DataFrame({'name_t':['kate', 'herz', 'catherine', 'sally'], 'score':[70, 60, 90, 100]})
print(pd.merge(df1, df2, left_on="name", right_on="name_t").drop('name_t', axis=1))

10.保存为csv,或者到mongo

df["company"].drop_duplicates().to_csv("company.csv",encoding="utf-8", index=False)
db[MONGO_TABLE].insert(row.to_dict()) 

11.使用value_count()来得到每个出现的次数

x = df.score.value_counts().index,
y = df.score.value_counts().values,

12. reset_index

df = pd.read_csv('2318.HK.csv')
df = df.loc[df['Date'] >= '2018-01-01']
df = df.reset_index(drop=True)
df

12. crosstab 看交叉

pd.crosstab(df.age, df.NumberOfDependents)

13.更改列的次序

order = ['name','type','value','date']
df = df[order]

14.更改columns

df.rename(columns={'rank':'value', 'rank_date':'date', 'site_name':'name'}, inplace = True)

15.显示多一些行和列

pd.set_option('max_columns',1000) 
pd.set_option('max_row',300) 

16.首字母大写

df['site_name'] = df['site_name'].str.capitalize()

17.更改index

DataFrame.index = [newName]

http://www.waitingfy.com/archives/3603

 

3603

Leave a Reply

Name and Email Address are required fields.
Your email will not be published or shared with third parties.