1. 程式人生 > >資料科學和人工智慧技術筆記 十九、資料整理(上)

資料科學和人工智慧技術筆記 十九、資料整理(上)

十九、資料整理(上)

作者:Chris Albon

譯者:飛龍

協議:CC BY-NC-SA 4.0

在 Pandas 中通過分組應用函式

import pandas as pd

# 建立示例資料幀
data = {'Platoon': ['A','A','A','A','A','A','B','B','B','B','B','C','C','C','C','C'],
       'Casualties': [1,4,5,7,5,5,6,1,4,5,6,7,4,6,4,6]}
df = pd.DataFrame(data)
df
Casualties Platoon
0 1 A
1 4 A
2 5 A
3 7 A
4 5 A
5 5 A
6 6 B
7 1 B
8 4 B
9 5 B
10 6 B
11 7 C
12 4 C
13 6 C
14 4 C
15 6 C
# 按照 df.platoon 對 df 分組
# 然後將滾動平均 lambda 函式應用於 df.casualties
df.groupby('Platoon')['Casualties'].apply(lambda x:x.rolling(center=False,window=2).mean())

'''
0     NaN
1     2.5
2     4.5
3     6.0
4     6.0
5     5.0
6     NaN
7     3.5
8     2.5
9     4.5
10    5.5
11    NaN
12    5.5
13    5.0
14    5.0
15    5.0
dtype: float64
'''

在 Pandas 中向分組應用操作

# 匯入模組
import pandas as pd

# 建立資料幀
raw_data = {'regiment': ['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons', 'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts', 'Scouts'], 
        'company': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd','1st', '1st', '2nd', '2nd'], 
        'name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze', 'Jacon', 'Ryaner', 'Sone', 'Sloan', 'Piger', 'Riani', 'Ali'], 
        'preTestScore': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
        'postTestScore': [25, 94, 57, 62, 70, 25, 94, 57, 62, 70, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['regiment', 'company', 'name', 'preTestScore', 'postTestScore'])
df
regiment company name preTestScore postTestScore
0 Nighthawks 1st Miller 4 25
1 Nighthawks 1st Jacobson 24 94
2 Nighthawks 2nd Ali 31 57
3 Nighthawks 2nd Milner 2 62
4 Dragoons 1st Cooze 3 70
5 Dragoons 1st Jacon 4 25
6 Dragoons 2nd Ryaner 24 94
7 Dragoons 2nd Sone 31 57
8 Scouts 1st Sloan 2 62
9 Scouts 1st Piger 3 70
10 Scouts 2nd Riani 2 62
11 Scouts 2nd Ali 3 70
# 建立一個 groupby 變數,按團隊(regiment)對 preTestScores 分組
groupby_regiment = df['preTestScore'].groupby(df['regiment'])
groupby_regiment

# <pandas.core.groupby.SeriesGroupBy object at 0x113ddb550> 

“這個分組變數現在是GroupBy物件。 除了分組的鍵df ['key1']的一些中間資料之外,它實際上還沒有計算任何東西。 我們的想法是,該物件具有將所有操作應用於每個分組所需的所有資訊。” – PyDA

使用list()顯示分組的樣子。

list(df['preTestScore'].groupby(df['regiment']))

'''
[('Dragoons', 4     3
  5     4
  6    24
  7    31
  Name: preTestScore, dtype: int64), ('Nighthawks', 0     4
  1    24
  2    31
  3     2
  Name: preTestScore, dtype: int64), ('Scouts', 8     2
  9     3
  10    2
  11    3
  Name: preTestScore, dtype: int64)] 
'''

df['preTestScore'].groupby(df['regiment']).describe()
count mean std min 25% 50% 75% max
regiment
Dragoons 4.0 15.50 14.153916 3.0 3.75 14.0 25.75 31.0
Nighthawks 4.0 15.25 14.453950 2.0 3.50 14.0 25.75 31.0
Scouts 4.0 2.50 0.577350 2.0 2.00 2.5 3.00 3.0
# 每個團隊的 preTestScore 均值
groupby_regiment.mean()

'''
regiment
Dragoons      15.50
Nighthawks    15.25
Scouts         2.50
Name: preTestScore, dtype: float64 
'''

df['preTestScore'].groupby([df['regiment'], df['company']]).mean()

'''
regiment    company
Dragoons    1st         3.5
            2nd        27.5
Nighthawks  1st        14.0
            2nd        16.5
Scouts      1st         2.5
            2nd         2.5
Name: preTestScore, dtype: float64 
'''

df['preTestScore'].groupby([df['regiment'], df['company']]).mean().unstack()
company 1st 2nd
regiment
Dragoons 3.5 27.5
Nighthawks 14.0 16.5
Scouts 2.5 2.5
# Group the whole data frame by regiment and company and average each group.
# numeric_only=True keeps the string column 'name' out of the mean; without
# it pandas 2.0+ raises TypeError when asked to average non-numeric data.
df.groupby(['regiment', 'company']).mean(numeric_only=True)
preTestScore postTestScore
regiment company
Dragoons 1st 3.5 47.5
2nd 27.5 75.5
Nighthawks 1st 14.0 59.5
2nd 16.5 59.5
Scouts 1st 2.5 66.0
2nd 2.5 66.0
# 每個團隊和公司的觀測數量
df.groupby(['regiment', 'company']).size()

'''
regiment    company
Dragoons    1st        2
            2nd        2
Nighthawks  1st        2
            2nd        2
Scouts      1st        2
            2nd        2
dtype: int64 
'''

# 按團隊對資料幀分組,對於每個團隊,
for name, group in df.groupby('regiment'): 
    # 列印團隊名稱
    print(name)
    # 列印它的資料
    print(group)


'''
Dragoons
   regiment company    name  preTestScore  postTestScore
4  Dragoons     1st   Cooze             3             70
5  Dragoons     1st   Jacon             4             25
6  Dragoons     2nd  Ryaner            24             94
7  Dragoons     2nd    Sone            31             57
Nighthawks
     regiment company      name  preTestScore  postTestScore
0  Nighthawks     1st    Miller             4             25
1  Nighthawks     1st  Jacobson            24             94
2  Nighthawks     2nd       Ali            31             57
3  Nighthawks     2nd    Milner             2             62
Scouts
   regiment company   name  preTestScore  postTestScore
8    Scouts     1st  Sloan             2             62
9    Scouts     1st  Piger             3             70
10   Scouts     2nd  Riani             2             62
11   Scouts     2nd    Ali             3             70 
'''

按列分組:

具體到這個例子:按列的資料型別(即 axis=1)分組,然後使用 list() 檢視該分組的樣子。

list(df.groupby(df.dtypes, axis=1))

'''
[(dtype('int64'),     preTestScore  postTestScore
  0              4             25
  1             24             94
  2             31             57
  3              2             62
  4              3             70
  5              4             25
  6             24             94
  7             31             57
  8              2             62
  9              3             70
  10             2             62
  11             3             70),
 (dtype('O'),       regiment company      name
  0   Nighthawks     1st    Miller
  1   Nighthawks     1st  Jacobson
  2   Nighthawks     2nd       Ali
  3   Nighthawks     2nd    Milner
  4     Dragoons     1st     Cooze
  5     Dragoons     1st     Jacon
  6     Dragoons     2nd    Ryaner
  7     Dragoons     2nd      Sone
  8       Scouts     1st     Sloan
  9       Scouts     1st     Piger
  10      Scouts     2nd     Riani
  11      Scouts     2nd     Ali)] 
'''

# Per-regiment mean of the numeric columns, with 'mean_' prepended to every
# resulting column label. numeric_only=True skips the string columns
# ('company', 'name'), which pandas 2.0+ would otherwise refuse to average.
df.groupby('regiment').mean(numeric_only=True).add_prefix('mean_')
mean_preTestScore mean_postTestScore
regiment
Dragoons 15.50 61.5
Nighthawks 15.25 59.5
Scouts 2.50 66.0
# Define a function that collects summary statistics for one group
def get_stats(group):
    """Return the min, max, count and mean of ``group`` as a dictionary.

    Args:
        group: An object exposing ``min()``, ``max()``, ``count()`` and
            ``mean()`` methods (the tutorial passes each group of a
            grouped pandas Series).

    Returns:
        A dict with the keys 'min', 'max', 'count' and 'mean', in that order.
    """
    summary = {}
    summary['min'] = group.min()
    summary['max'] = group.max()
    summary['count'] = group.count()
    summary['mean'] = group.mean()
    return summary

bins = [0, 25, 50, 75, 100]
group_names = ['Low', 'Okay', 'Good', 'Great']
df['categories'] = pd.cut(df['postTestScore'], bins, labels=group_names)

df['postTestScore'].groupby(df['categories']).apply(get_stats).unstack()
count max mean min
categories
Good 8.0 70.0 63.75 57.0
Great 2.0 94.0 94.00 94.0
Low 2.0 25.0 25.00 25.0
Okay 0.0 NaN NaN NaN

在 Pandas 資料幀上應用操作

# 匯入模組
import pandas as pd
import numpy as np

data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 
        'year': [2012, 2012, 2013, 2014, 2014], 
        'reports': [4, 24, 31, 2, 3],
        'coverage': [25, 94, 57, 62, 70]}
df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
df
coverage name reports year
Cochice 25 Jason 4 2012
Pima 94 Molly 24 2012
Santa Cruz 57 Tina 31 2013
Maricopa 62 Jake 2 2014
Yuma 70 Amy 3 2014
# 建立大寫轉換的 lambda 函式
capitalizer = lambda x: x.upper()

將 capitalizer 函式應用於 name 列。

apply()可以沿資料幀的任意軸應用函式。

df['name'].apply(capitalizer)

'''
Cochice       JASON
Pima          MOLLY
Santa Cruz     TINA
Maricopa       JAKE
Yuma            AMY
Name: name, dtype: object 
'''

將 capitalizer lambda 函式對映到序列 name 中的每個元素。

map()對序列的每個元素應用操作。

df['name'].map(capitalizer)

'''
Cochice       JASON
Pima          MOLLY
Santa Cruz     TINA
Maricopa       JAKE
Yuma            AMY
Name: name, dtype: object 
'''

將平方根函式應用於整個資料幀中的每個單元格。

applymap()將函式應用於整個資料幀中的每個元素。

# 刪除字串變數,以便 applymap() 可以執行
df = df.drop('name', axis=1)

# 返回資料幀每個單元格的平方根
df.applymap(np.sqrt)
coverage reports year
Cochice 5.000000 2.000000 44.855323
Pima 9.695360 4.898979 44.855323
Santa Cruz 7.549834 5.567764 44.866469
Maricopa 7.874008 1.414214 44.877611
Yuma 8.366600 1.732051 44.877611

在資料幀上應用函式。

# Define a function called times100
def times100(x):
    """Return ``x`` multiplied by 100, passing strings and ``None`` through.

    Args:
        x: A single cell value.

    Returns:
        ``x`` itself when it is a string (including ``str`` subclasses),
        ``None`` when ``x`` is ``None``, and ``100 * x`` otherwise.
    """
    # isinstance (rather than an exact type comparison) also catches str
    # subclasses, which would otherwise fall through to ``100 * x`` and be
    # repeated 100 times.
    if isinstance(x, str):
        return x
    # Only a missing value is left alone. A legitimate zero is still scaled
    # (to 0) instead of being silently replaced by None.
    if x is None:
        return None
    return 100 * x

df.applymap(times100)
coverage reports year
Cochice 2500 400 201200
Pima 9400 2400 201200
Santa Cruz 5700 3100 201300
Maricopa 6200 200 201400
Yuma 7000 300 201400

向 Pandas 資料幀賦予新列

import pandas as pd

# 建立空資料幀
df = pd.DataFrame()

# 建立一列
df['name'] = ['John', 'Steve', 'Sarah']

# 檢視資料幀
df
name
0 John
1 Steve
2 Sarah
# 為 df 賦予一個名為 age 的新列,它包含年齡列表
df.assign(age = [31, 32, 19])
name age
0 John 31
1 Steve 32
2 Sarah 19

將列表拆分為大小為 N 的分塊

在這個片段中,我們接受一個列表並將其分解為大小為 n 的塊。 在處理具有最大請求大小的 API 時,這是一種非常常見的做法。

這個漂亮的函式由 Ned Batchelder 貢獻,釋出於 StackOverflow

# 建立名稱列表
first_names = ['Steve', 'Jane', 'Sara', 'Mary','Jack','Bob', 'Bily', 'Boni', 'Chris','Sori', 'Will', 'Won','Li']

# Define a generator function called chunks taking two arguments, l and n
def chunks(l, n):
    """Yield consecutive slices of ``l``, each holding at most ``n`` items.

    Args:
        l: A sliceable sequence with a length.
        n: The step between slice starts, i.e. the chunk size.

    Yields:
        ``l[0:n]``, ``l[n:2n]``, ... The last slice is shorter when
        ``len(l)`` is not a multiple of ``n``.
    """
    length = len(l)
    # Walk the start offsets 0, n, 2n, ... that lie below the length of l
    for start in range(0, length, n):
        stop = start + n
        yield l[start:stop]

# 從函式 chunks 的結果建立一個列表
list(chunks(first_names, 5))

'''
[['Steve', 'Jane', 'Sara', 'Mary', 'Jack'],
 ['Bob', 'Bily', 'Boni', 'Chris', 'Sori'],
 ['Will', 'Won', 'Li']] 
'''

在 Pandas 中使用正則表示式將字串分解為列

# 匯入模組
import re
import pandas as pd

# 建立帶有一列字串的資料幀
data = {'raw': ['Arizona 1 2014-12-23       3242.0',
                'Iowa 1 2010-02-23       3453.7',
                'Oregon 0 2014-06-20       2123.0',
                'Maryland 0 2014-03-14       1123.6',
                'Florida 1 2013-01-15       2134.0',
                'Georgia 0 2012-07-14       2345.6']}
df = pd.DataFrame(data, columns = ['raw'])
df
raw
0 Arizona 1 2014-12-23 3242.0
1 Iowa 1 2010-02-23 3453.7
2 Oregon 0 2014-06-20 2123.0
3 Maryland 0 2014-03-14 1123.6
4 Florida 1 2013-01-15 2134.0
5 Georgia 0 2012-07-14 2345.6
# df['raw'] 的哪些行包含 'xxxx-xx-xx'?
df['raw'].str.contains('....-..-..', regex=True)

'''
0    True
1    True
2    True
3    True
4    True
5    True
Name: raw, dtype: bool 
'''

# From the raw column, extract a single digit out of each string.
# The pattern is a raw string so that \d reaches the regex engine verbatim
# instead of being parsed as an invalid string escape sequence.
df['female'] = df['raw'].str.extract(r'(\d)', expand=True)
df['female']

'''
0    1
1    1
2    0
3    0
4    1
5    0
Name: female, dtype: object 
'''

# 在 raw 列中,提取字串中的 xxxx-xx-xx
df['date'] = df['raw'].str.extract('(....-..-..)', expand=True)
df['date']

'''
0    2014-12-23
1    2010-02-23
2    2014-06-20
3    2014-03-14
4    2013-01-15
5    2012-07-14
Name: date, dtype: object 
'''

# From the raw column, extract the score: four digits, a literal dot and one
# more digit. The pattern is a raw string so that \d and \. are handed to the
# regex engine verbatim instead of being parsed as invalid string escapes.
df['score'] = df['raw'].str.extract(r'(\d\d\d\d\.\d)', expand=True)
df['score']

'''
0    3242.0
1    3453.7
2    2123.0
3    1123.6
4    2134.0
5    2345.6
Name: score, dtype: object 
'''

# From the raw column, extract a word: an uppercase letter followed by zero
# or more word characters. The pattern is a raw string so that \w is handed
# to the regex engine verbatim instead of being parsed as an invalid string
# escape.
df['state'] = df['raw'].str.extract(r'([A-Z]\w{0,})', expand=True)
df['state']

'''
0     Arizona
1        Iowa
2      Oregon
3    Maryland
4     Florida
5     Georgia
Name: state, dtype: object 
'''

df
raw female date score state
0 Arizona 1 2014-12-23 3242.0 1 2014-12-23 3242.0 Arizona
1 Iowa 1 2010-02-23 3453.7 1 2010-02-23 3453.7 Iowa
2 Oregon 0 2014-06-20 2123.0 0 2014-06-20 2123.0 Oregon
3 Maryland 0 2014-03-14 1123.6 0 2014-03-14 1123.6 Maryland
4 Florida 1 2013-01-15 2134.0 1 2013-01-15 2134.0 Florida
5 Georgia 0 2012-07-14 2345.6 0 2012-07-14 2345.6 Georgia

尋找兩個資料幀共享的列

# 匯入庫
import pandas as pd

# Build the first data frame, one column at a time
dataframe_one = pd.DataFrame()
dataframe_one['1'] = ['1'] * 3
dataframe_one['B'] = ['b'] * 3

# Build the second data frame the same way
dataframe_two = pd.DataFrame()
dataframe_two['2'] = ['2'] * 3
dataframe_two['B'] = ['b'] * 3

# Turn the first frame's columns into a set and intersect it with the
# columns of the second frame; the result is the set of column labels
# the two data frames share.
set(dataframe_one).intersection(dataframe_two)

# {'B'} 

# {'B'} 

從多個列表構建字典

# Names of the officers
officer_names = [
    'Sodoni Dogla',
    'Chris Jefferson',
    'Jessica Billars',
    'Michael Mulligan',
    'Steven Johnson',
]

# Army of each officer, listed in the same order as officer_names
officer_armies = [
    'Purple Army',
    'Orange Army',
    'Green Army',
    'Red Army',
    'Blue Army',
]

# Pair every name with the army at the same position and build a dictionary
{name: army for name, army in zip(officer_names, officer_armies)}

'''
{'Chris Jefferson': 'Orange Army',
 'Jessica Billars': 'Green Army',
 'Michael Mulligan': 'Red Army',
 'Sodoni Dogla': 'Purple Army',
 'Steven Johnson': 'Blue Army'} 
'''

將 CSV 轉換為 Python 程式碼來重建它

# 匯入 pandas 包
import pandas as pd

# 將 csv 檔案載入為資料幀
df_original = pd.read_csv('http://vincentarelbundock.github.io/Rdatasets/csv/datasets/iris.csv')
df = pd.read_csv('http://vincentarelbundock.github.io/Rdatasets/csv/datasets/iris.csv')

# 列印建立資料幀的程式碼
print('==============================')
print('RUN THE CODE BELOW THIS LINE')
print('==============================')
print('raw_data =', df.to_dict(orient='list'))
print('df = pd.DataFrame(raw_da