Python資料分析學習總結

阿新 • • 發佈：2018-12-22

Python資料分析基礎

numpy

開源、資料計算擴充套件；ndarray、多維操作、線性代數

numpy使用程式：

import numpy as np

def main():
    """Convert a nested list to an ndarray and print its basic attributes.

    Prints, in order: the type of the source list, the type of the array
    built from it, and then the shape, number of dimensions, dtype,
    per-element size in bytes and total element count of a float64 array
    built from the same data.
    """
    lst = [[1, 3, 5], [2, 4, 6]]
    print(type(lst))
    np_lst = np.array(lst)
    print(type(np_lst))
    # The ``np.float`` alias was deprecated in NumPy 1.20 and removed in 1.24;
    # ``np.float64`` names the same dtype explicitly and works on every version.
    np_lst = np.array(lst, dtype=np.float64)
    print(np_lst.shape)
    print(np_lst.ndim)
    print(np_lst.dtype)
    print(np_lst.itemsize)
    print(np_lst.size)


if __name__ == "__main__":
    main()
執行結果：
<class 'list'>
<class 'numpy.ndarray'>
(2, 3)
2
float64
8
6

numpy常用陣列

# Arrays pre-filled with a constant: 2x4 zeros, then 3x5 ones.
for constant_array in (np.zeros([2, 4]), np.ones([3, 5])):
    print(constant_array)

# Uniform samples: a 2x4 array first, then a single bare scalar.
uniform_block = np.random.rand(2, 4)
print(uniform_block)
uniform_scalar = np.random.rand()
print(uniform_scalar)

# Remaining samplers are wrapped in lambdas so each draw happens right before
# it is printed, keeping the order of random-state consumption unchanged.
labelled_samplers = (
    ("RandInt:", lambda: np.random.randint(1, 10, 3)),
    ("Randn:", lambda: np.random.randn(2, 4)),  # standard normal distribution
    ("Choice", lambda: np.random.choice([10, 20, 30])),
    ("Distribute:", lambda: np.random.beta(1, 10, 100)),  # Beta distribution
)
for label, draw in labelled_samplers:
    print(label)
    print(draw())

執行結果：
[[ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]]
[[ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]]
[[ 0.80307088  0.25491367  0.54381007  0.10159737]
 [ 0.71565024  0.62473538  0.66892166  0.41078071]]
0.16467244260637237
RandInt:
[5 3 2]
Randn:
[[-0.51707383 -1.46091351 -0.78197086  0.44640286]
 [-0.0998081   0.40701679  0.07750661  0.66041753]]
Choice
10
Distribute:
[ 0.03897375  0.09804991  0.1617222  ...,  0.12878516  0.11699157
  0.05681225]

numpy常用操作

# Integer range 1..10 (the stop value 11 is excluded).
print("Arange:")
print(np.arange(1, 11))

# Apply each element-wise function to ``lst`` (defined earlier in the
# article) and print the result under its label.
elementwise_demos = (
    ("Exp:", np.exp),
    ("Exp2:", np.exp2),
    ("Sqrt:", np.sqrt),
    ("Sin:", np.sin),
    ("Log:", np.log),
)
for label, ufunc in elementwise_demos:
    print(label)
    print(ufunc(lst))
執行結果：
Arange:
[ 1  2  3  4  5  6  7  8  9 10]
Exp:
[[   2.71828183   20.08553692  148.4131591 ]
 [   7.3890561    54.59815003  403.42879349]]
Exp2:
[[  2.   8.  32.]
 [  4.  16.  64.]]
Sqrt:
[[ 1.          1.73205081  2.23606798]
 [ 1.41421356  2.          2.44948974]]
Sin:
[[ 0.84147098  0.14112001 -0.95892427]
 [ 0.90929743 -0.7568025  -0.2794155 ]]
Log:
[[ 0.          1.09861229  1.60943791]
 [ 0.69314718  1.38629436  1.79175947]]

# A 3x2x4 integer array used to show how reductions behave along each axis.
lst = np.array([
    [[1, 2, 3, 4], [4, 5, 6, 7]],
    [[7, 8, 9, 10], [10, 11, 12, 13]],
    [[14, 15, 16, 17], [18, 19, 20, 11]],
])

# Sum along the innermost axis first, then the middle one, then the outermost.
for reduce_axis in (2, 1, 0):
    print(lst.sum(axis=reduce_axis))

print("Max:")
print(np.max(lst, axis=1))
print("Min:")
print(np.min(lst, axis=0))
執行結果：
[[10 22]
 [34 46]
 [62 68]]
[[ 5  7  9 11]
 [17 19 21 23]
 [32 34 36 28]]
[[22 25 28 31]
 [32 35 38 31]]
Max:
[[ 4  5  6  7]
 [10 11 12 13]
 [18 19 20 17]]
Min:
[[1 2 3 4]
 [4 5 6 7]]

# Two 1-D operands shared by every demo below.
lst1 = np.array([10, 20, 30, 40])
lst2 = np.array([4, 3, 2, 1])

# Each entry pairs the label to print with the already-computed result.
labelled_results = (
    ("Add:", np.add(lst1, lst2)),
    ("Sub:", np.subtract(lst1, lst2)),
    ("Mul:", np.multiply(lst1, lst2)),
    ("Div:", np.true_divide(lst1, lst2)),
    ("Square:", np.power(lst1, 2)),
    # Matrix product of the two operands viewed as 2x2 matrices.
    ("Dot:", lst1.reshape(2, 2).dot(lst2.reshape(2, 2))),
    ("Concatenate:", np.concatenate((lst1, lst2), axis=0)),
    ("vstack:", np.vstack((lst1, lst2))),
    ("hstack:", np.hstack((lst1, lst2))),
)
for label, result in labelled_results:
    print(label)
    print(result)

# Split into 2 and then 4 equal parts; both go under a single label.
print("Split:")
for part_count in (2, 4):
    print(np.split(lst1, part_count))

print("Copy:")
print(lst1.copy())
執行結果：
Add:
[14 23 32 41]
Sub:
[ 6 17 28 39]
Mul:
[40 60 60 40]
Div:
[  2.5          6.66666667  15.          40.        ]
Square:
[ 100  400  900 1600]
Dot:
[[ 80  50]
 [200 130]]
Concatenate:
[10 20 30 40  4  3  2  1]
vstack:
[[10 20 30 40]
 [ 4  3  2  1]]
hstack:
[10 20 30 40  4  3  2  1]
Split:
[array([10, 20]), array([30, 40])]
[array([10]), array([20]), array([30]), array([40])]
Copy:
[10 20 30 40]

線性方程組

import numpy as np
from numpy.linalg import *

def main():
    """Print basic linear-algebra results for a 2x2 matrix.

    Output order: the 3x3 identity matrix, then the inverse, transpose,
    determinant and eigen-decomposition of ``[[1, 2], [3, 4]]``, each
    preceded by its label.
    """
    identity = np.eye(3)
    print(identity)

    matrix = np.array([[1, 2], [3, 4]])
    # ``inv``, ``det`` and ``eig`` come from the file's numpy.linalg star import.
    labelled_results = (
        ("Inv:", inv(matrix)),
        ("T:", matrix.T),
        ("Det:", det(matrix)),
        ("Eig:", eig(matrix)),
    )
    for label, result in labelled_results:
        print(label)
        print(result)


if __name__ == "__main__":
    main()

執行結果：
[[ 1.  0.  0.]
 [ 0.  1.  0.]
 [ 0.  0.  1.]]
Inv:
[[-2.   1. ]
 [ 1.5 -0.5]]
T:
[[1 3]
 [2 4]]
Det:
-2.0
Eig:
(array([-0.37228132,  5.37228132]), array([[-0.82456484, -0.41597356],
       [ 0.56576746, -0.90937671]]))

numpy其他方面應用

import numpy as np
from numpy.linalg import *

def main():
    """Print three small numpy demos: an FFT, a correlation matrix, a polynomial.

    Output order: the FFT of eight ones, the correlation-coefficient matrix
    of ``[1, 0, 1]`` and ``[0, 2, 1]``, and the pretty-printed polynomial
    with coefficients ``[2, 1, 3]``, each preceded by its label.
    """
    constant_signal = np.array([1] * 8)
    print("FFT:")
    print(np.fft.fft(constant_signal))

    first_series, second_series = [1, 0, 1], [0, 2, 1]
    print("Coef:")
    print(np.corrcoef(first_series, second_series))

    polynomial = np.poly1d([2, 1, 3])
    print("Poly:")
    print(polynomial)


if __name__ == "__main__":
    main()

執行結果：
FFT:
[ 8.+0.j  0.+0.j  0.+0.j  0.+0.j  0.+0.j  0.+0.j  0.+0.j  0.+0.j]
Coef:
[[ 1.        -0.8660254]
 [-0.8660254  1.       ]]
Poly:
   2
2 x + 1 x + 3

matplotlib

概述

matplotlib是關鍵的繪圖庫。

實現

import numpy as np
import matplotlib.pyplot as plt

def main():
    """Draw a series of matplotlib demo plots, calling ``plt.show()`` after each.

    Sections, in order: line plot, scatter, bar, pie, "polar", heat map,
    filled contour and 3D scatter. The line plot uses its own figure
    (``plt.figure(1)``); every later section adds a subplot to the single
    ``fig`` created in the scatter section.

    NOTE(review): subplots keep being added to ``fig`` after ``plt.show()``
    has already been called on it; whether they are displayed by the later
    ``plt.show()`` calls presumably depends on the backend - confirm.
    """
    # Line plot: cosine and sine sampled at 256 points on [-pi, pi].
    x=np.linspace(-np.pi,np.pi,256,endpoint=True)
    c,s=np.cos(x),np.sin(x)
    plt.figure(1)
    plt.plot(x,c,color="blue",linewidth=1.0,linestyle="-",label="COS",alpha=0.5)
    plt.plot(x,s,"r*",label="SIN")
    plt.title("COS & SIN")
    # Hide the right/top spines and move the left/bottom ones to data
    # coordinate 0 so the axes cross at the origin.
    ax=plt.gca()
    ax.spines["right"].set_color("none")
    ax.spines["top"].set_color("none")
    ax.spines["left"].set_position(("data",0))
    ax.spines["bottom"].set_position(("data",0))
    ax.xaxis.set_ticks_position("bottom")
    ax.yaxis.set_ticks_position("left")
    plt.show()

    # Scatter: 128 points with normally distributed coordinates, coloured by
    # their angle arctan2(Y, X); drawn in cell 1 of a 3x3 grid on a new figure.
    fig=plt.figure()
    ax=fig.add_subplot(3,3,1)
    n=128
    X=np.random.normal(0,1,n)
    Y=np.random.normal(0,1,n)
    T=np.arctan2(Y,X)
    #plt.axes([0.025,0.025,0.95,0.95])
    #plt.scatter(X,Y,s=75,c=T,alpha=0.5)
    ax.scatter(X,Y,s=75,c=T,alpha=0.5)
    # Clamp both axes to [-1.5, 1.5] and remove the tick marks.
    plt.xlim(-1.5,1.5),plt.xticks([])
    plt.ylim(-1.5,1.5),plt.yticks([])
    plt.axis()
    plt.title("scatter")
    plt.xlabel("x")
    plt.ylabel("y")
    plt.show()

    # Bar: two random series, Y1 drawn upwards and Y2 mirrored downwards,
    # each bar annotated with its value formatted to two decimals.
    fig.add_subplot(332)
    n=10
    X=np.arange(n)
    Y1=(1-X/float(n))*np.random.uniform(0.5,1.0,n)
    Y2=(1-X/float(n))*np.random.uniform(0.5,1.0,n)
    plt.bar(X,+Y1,facecolor='#9999ff',edgecolor='white')
    plt.bar(X,-Y2,facecolor='#9999ff',edgecolor='white')
    for x,y in zip(X,Y1):
        plt.text(x+0.4,y+0.05,'%.2f' % y,ha='center',va='bottom')
    for x,y in zip(X,Y2):
        plt.text(x+0.4,-y-0.05,'%.2f' % y,ha='center',va='bottom')
    plt.show()

    # Pie: 20 wedges of weight 1 except the last, which is doubled; colours
    # and labels are both derived from i / n.
    fig.add_subplot(333)
    n=20
    Z=np.ones(n)
    Z[-1]*=2
    plt.pie(Z,explode=Z*.05,colors=['%s' % (i / float(n)) for i in range(n)],
            labels=['%.2f' % (i / float(n)) for i in range(n)])
    plt.gca().set_aspect('equal')
    plt.xticks([]), plt.yticks([])
    plt.show()

    # "Polar": random radii against 20 evenly spaced angles in [0, 2*pi).
    # NOTE(review): the subplot is created without a polar projection, so
    # this is drawn as an ordinary line plot - confirm whether polar axes
    # were intended.
    fig.add_subplot(334)
    n=20
    theta=np.arange(0.0,2*np.pi,2*np.pi/n)
    radii=10*np.random.rand(n)
    plt.plot(theta, radii)
    plt.show()

    # Heat map of a random 3x3 matrix using the Blues colormap, with the
    # colour scale pinned to [0, 1].
    fig.add_subplot(335)
    from matplotlib import cm
    data=np.random.rand(3,3)
    cmap=cm.Blues
    # NOTE(review): the local name ``map`` shadows the builtin and the
    # returned image object is never used afterwards.
    map=plt.imshow(data,interpolation='nearest',cmap=cmap,aspect='auto',vmin=0,vmax=1)
    plt.show()

    # Filled contour of f(x, y) on a 256x256 grid over [-3, 3] x [-3, 3].
    # 313 selects cell 3 of a 3x1 grid, i.e. a different grid layout from the
    # 3x3 cells used by the other subplots of ``fig``.
    fig.add_subplot(313)
    def f(x,y):
        return (1-x/2+x**5+y**3)*np.exp(-x**2-y**2)
    n=256
    x=np.linspace(-3,3,n)
    y=np.linspace(-3,3,n)
    X,Y=np.meshgrid(x,y)
    plt.contourf(X,Y,f(X,Y),8,alpha=.75,cmap=plt.cm.hot)
    plt.show()

    # 3D: a single scatter point at (1, 1, 3) on a 3D-projection subplot.
    # NOTE(review): older matplotlib releases presumably need
    # ``from mpl_toolkits.mplot3d import Axes3D`` for projection="3d" - confirm.
    ax=fig.add_subplot(336,projection="3d")
    ax.scatter(1,1,3,s=100)
    plt.show()

if __name__=="__main__":
    main()

scipy

簡介

數值計算庫

積分

程式：
import numpy as np
from scipy.integrate import quad,dblquad,nquad

def main():
    """Print the results of three numerical integrations.

    Each printed value is the ``(estimate, error)`` pair returned by SciPy:
    a single integral with ``quad``, a double integral with ``dblquad`` and
    a double integral with a variable inner bound computed by ``nquad``.
    """
    # Single integral of exp(-x) over [0, inf).
    def decaying_exponential(x):
        return np.exp(-x)

    print(quad(decaying_exponential, 0, np.inf))

    # Double integral: outer variable x over [0, inf), inner variable t over
    # [1, inf) as given by the two bound callables.
    def double_integrand(t, x):
        return np.exp(-x * t) / t ** 3

    def inner_lower(x):
        return 1

    def inner_upper(x):
        return np.inf

    print(dblquad(double_integrand, 0, np.inf, inner_lower, inner_upper))

    # nquad: the x range depends on y, while the y range is fixed.
    def product(x, y):
        return x * y

    def y_range():
        return [0, 0.5]

    def x_range(y):
        return [0, 1 - 2 * y]

    print(nquad(product, [x_range, y_range]))


if __name__ == "__main__":
    main()

執行結果：
(1.0000000000000002, 5.842607038578007e-11)
(0.3333333333366853, 1.3888461883425516e-08)
(0.010416666666666668, 4.101620128472366e-16)

優化器

import numpy as np
from scipy.optimize import minimize

def main():
    """Minimise the Rosenbrock function with Nelder-Mead and print the result.

    Starts from a fixed 5-dimensional point, lets SciPy print its own
    convergence summary (``disp=True``) and then prints the returned result
    object after the ``"ROSE MINI:"`` label.
    """
    # Optimizer
    def rosen(x):
        """Rosenbrock function of a 1-D array ``x``."""
        return np.sum(100.0 * (x[1:] - x[:-1] ** 2.0) ** 2.0 + (1 - x[:-1]) ** 2.0)

    x0 = np.array([1.3, 0.7, 0.8, 1.9, 1.2])
    # Nelder-Mead's absolute tolerance on x is named ``xatol``; the older
    # ``xtol`` spelling is deprecated or reported as an unknown option by
    # current SciPy, which would silently fall back to the default tolerance.
    res = minimize(rosen, x0, method="nelder-mead",
                   options={"xatol": 1e-8, "disp": True})
    print("ROSE MINI:", res)


if __name__ == "__main__":
    main()

執行結果：
Optimization terminated successfully.
             Current function value: 0.000000
         Iterations: 339
         Function evaluations: 571
ROSE MINI:  final_simplex: (array([[ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ],
       [ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ],
       [ 1.        ,  1.        ,  1.        ,  1.00000001,  1.00000001],
       [ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ],
       [ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ],
       [ 1.        ,  1.        ,  1.        ,  1.        ,  0.99999999]]), array([  4.86115343e-17,   7.65182843e-17,   8.11395684e-17,
         8.63263255e-17,   8.64080682e-17,   2.17927418e-16]))
           fun: 4.8611534334221152e-17
       message: 'Optimization terminated successfully.'
          nfev: 571
           nit: 339
        status: 0
       success: True
             x: array([ 1.,  1.,  1.,  1.,  1.])

插值

import numpy as np
# The class is spelled ``interp1d`` (digit one); ``interpld`` does not exist.
from scipy.interpolate import interp1d
# ``root`` is used by main() but was never imported.
from scipy.optimize import root


def main():
    """Find a root of x + 2*cos(x), then cubic-interpolate a sampled sine wave.

    Prints the root and the residual reported by ``scipy.optimize.root``,
    plots the 10 original samples (red) against the 50 interpolated points
    (black), and finally prints the interpolated values.
    """
    # The plotting calls were previously unqualified names with no import;
    # pyplot is imported here so the snippet is self-contained.
    import matplotlib.pyplot as plt

    def fun(x):
        return x + 2 * np.cos(x)

    sol = root(fun, 0.1)
    print("ROOT:", sol.x, sol.fun)

    # Interpolation: 10 samples of one sine period, resampled at 50 points.
    x = np.linspace(0, 1, 10)
    y = np.sin(2 * np.pi * x)
    li = interp1d(x, y, kind="cubic")
    x_new = np.linspace(0, 1, 50)
    y_new = li(x_new)

    plt.figure()
    plt.plot(x, y, "r")
    plt.plot(x_new, y_new, "k")
    plt.show()
    print(y_new)


if __name__ == "__main__":
    main()

線性計算與矩陣分解

程式：
import numpy as np
from scipy import linalg as lg

def main():
    """Print SciPy linear-algebra results for the matrix ``[[1, 2], [3, 4]]``.

    Output order: determinant, inverse, solution of ``arr @ x = [6, 14]``,
    then the eigen, LU, QR, SVD and Schur decompositions, each on a line
    that starts with its label.
    """
    matrix = np.array([[1, 2], [3, 4]])
    print("Det:", lg.det(matrix))
    print("Inv:", lg.inv(matrix))

    rhs = np.array([6, 14])
    print("Sol:", lg.solve(matrix, rhs))

    # Every remaining routine takes just the matrix, so drive them from a table.
    decompositions = (
        ("Eig:", lg.eig),
        ("LU:", lg.lu),
        ("QR:", lg.qr),
        ("SVD:", lg.svd),
        ("Schur:", lg.schur),
    )
    for label, decompose in decompositions:
        print(label, decompose(matrix))


if __name__ == "__main__":
    main()

執行結果：
Det: -2.0
Inv: [[-2.   1. ]
 [ 1.5 -0.5]]
Sol: [ 2.  2.]
Eig: (array([-0.37228132+0.j,  5.37228132+0.j]), array([[-0.82456484, -0.41597356],
       [ 0.56576746, -0.90937671]]))
LU: (array([[ 0.,  1.],
       [ 1.,  0.]]), array([[ 1.        ,  0.        ],
       [ 0.33333333,  1.        ]]), array([[ 3.        ,  4.        ],
       [ 0.        ,  0.66666667]]))
QR: (array([[-0.31622777, -0.9486833 ],
       [-0.9486833 ,  0.31622777]]), array([[-3.16227766, -4.42718872],
       [ 0.        , -0.63245553]]))
SVD: (array([[-0.40455358, -0.9145143 ],
       [-0.9145143 ,  0.40455358]]), array([ 5.4649857 ,  0.36596619]), array([[-0.57604844, -0.81741556],
       [ 0.81741556, -0.57604844]]))
Schur: (array([[-0.37228132, -1.        ],
       [ 0.        ,  5.37228132]]), array([[-0.82456484, -0.56576746],
       [ 0.56576746, -0.82456484]]))

pandas

簡介

資料分析庫

基礎資料分析技術

import numpy as np
import pandas as pd

def main():
    """Walk through core pandas operations, printing each intermediate result.

    Sections: building a Series/DataFrame, basic inspection and sorting,
    selection by label/position/mask, assignment, missing-value handling,
    and concat/merge/groupby. The DataFrame is filled with random normal
    values, so the printed numbers differ from run to run.
    """
    # Data structures
    s = pd.Series([i * 2 for i in range(1, 11)])
    print(type(s))
    dates = pd.date_range("20170301", periods=8)
    df = pd.DataFrame(np.random.randn(8, 5), index=dates, columns=list("ABCDE"))
    print(df)

    # Basic inspection
    print(df.head(3))
    print(df.tail(3))
    print(df.index)
    print(df.values)
    print(df.T)
    # DataFrame.sort(columns=...) was removed from pandas; sort_values(by=...)
    # is the replacement for sorting rows by a column.
    print(df.sort_values(by="C"))
    print(df.sort_index(axis=1, ascending=False))
    print(df.describe())

    # Selection
    print(type(df["A"]))
    print(df[:3])
    print(df["20170301":"20170304"])
    print(df.loc[dates[0]])
    print(df.loc["20170301":"20170304", ["B", "D"]])
    print(df.iloc[1:2, 2:4])
    print(df.iloc[1, 4])
    # One combined mask instead of chained indexing (df[m1][m2]), which
    # applies the second mask to an already-filtered frame and makes pandas
    # warn that the boolean key is being reindexed.
    print(df[(df.B > 0) & (df.A < 0)])
    print(df[df > 0])
    print(df[df["E"].isin([1, 2])])

    # Assignment
    s1 = pd.Series(list(range(10, 18)), index=pd.date_range("20170301", periods=8))
    df["F"] = s1
    print(df)
    df.at[dates[0], "A"] = 0
    print(df)
    df.iat[1, 1] = 1
    df.loc[:, "D"] = np.array([4] * len(df))
    df2 = df.copy()
    # The original assigned ``df2`` to itself here, which changes nothing;
    # negating the positive entries makes the masked assignment visible.
    df2[df2 > 0] = -df2
    print(df2)

    # Missing values: column "G" does not exist in df, so reindex fills it
    # with NaN; only the first two rows are then given a value.
    df1 = df.reindex(index=dates[:4], columns=list("ABCD") + ["G"])
    df1.loc[dates[0]:dates[1], "G"] = 1
    print(df1)
    print(df1.dropna())
    print(df1.fillna(value=2))

    # Concat / merge / groupby
    pieces = [df[:3], df[-3:]]
    print(pd.concat(pieces))
    left = pd.DataFrame({"key": ["x", "y"], "value": [1, 2]})
    right = pd.DataFrame({"key": ["x", "z"], "value": [3, 4]})
    print("LEFT:", left)
    print("RIGHT:", right)  # label previously misspelled as "RIFHT:"
    print(pd.merge(left, right, on="key", how="left"))
    df3 = pd.DataFrame({"A": ["a", "b", "c", "b"], "B": list(range(4))})
    print(df3.groupby("A").sum())


if __name__ == "__main__":
    main()

時間、繪圖

import numpy as np
import pandas as pd
from pylab import *

def main():
    """Print a second-frequency date range, then plot a random-walk series.

    The plotted series is the cumulative sum of 1000 random normal values
    indexed by consecutive days starting at 2017-03-01.
    """
    # Time series: ten timestamps one second apart.
    # The uppercase "S" alias is deprecated in recent pandas releases in
    # favour of lowercase "s".
    t_exam = pd.date_range("20170301", periods=10, freq="s")
    print(t_exam)

    # Graph
    ts = pd.Series(np.random.randn(1000), index=pd.date_range("20170301", periods=1000))
    ts = ts.cumsum()
    ts.plot()
    # ``show`` comes from the file's ``from pylab import *``.
    show()


if __name__ == "__main__":
    main()

scikit-learn

簡介

資料探勘建模、機器學習
機器學習與決策樹

機器學習：因子–>結果

結果：

不帶標記–>無監督學習（聚類）；帶標記–>監督學習

有限離散–>分類；連續–>迴歸

決策樹：監督學習；樹形結構
Iris資料集
- 花萼長度
- 花萼寬度
- 花瓣長度
- 花瓣寬度
- 種類：Iris Setosa（山鳶尾）、Iris Versicolour（雜色鳶尾）、Iris Virginica（維吉尼亞鳶尾）

實現

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
# ``sklearn.cross_validation`` was removed from scikit-learn;
# ``train_test_split`` now lives in ``sklearn.model_selection``.
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import metrics


def main():
    """Train an entropy-based decision tree on Iris and print its test metrics.

    Prints the raw dataset object and its sample count, then the accuracy
    and confusion matrix on a 20% hold-out split (``random_state=1``).
    """
    # Pre-processing
    iris = load_iris()
    print(iris)
    print(len(iris["data"]))
    train_data, test_data, train_target, test_target = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=1)

    # Model
    clf = tree.DecisionTreeClassifier(criterion="entropy")
    clf.fit(train_data, train_target)
    y_pred = clf.predict(test_data)

    # Verify
    print(metrics.accuracy_score(y_true=test_target, y_pred=y_pred))
    print(metrics.confusion_matrix(y_true=test_target, y_pred=y_pred))


if __name__ == "__main__":
    main()

keras

簡介

人工神經網路
簡單神經網路實現

Keras安裝步驟：Anaconda CMD；conda install mingw libpython；pip install keras；pip install np_utils

例項

注意：需要修改C:/user/username/.keras/keras.json，具體改後內容如下：{"backend": "theano", "image_data_format": "th", "epsilon": 1e-07, "floatx": "float32"}。

import numpy as np
from keras.models import Sequential
from keras.layers import Dense,Activation
from keras.optimizers import SGD
from sklearn.datasets import load_iris
from sklearn.preprocessing import LabelBinarizer
# ``sklearn.cross_validation`` was removed from scikit-learn;
# ``train_test_split`` now lives in ``sklearn.model_selection``.
from sklearn.model_selection import train_test_split


def main():
    """Train a small dense network on Iris and print its predicted test classes.

    Prints the raw Iris targets, fits a 4-5-3 network for 200 epochs on an
    80% training split (``random_state=1``) with one-hot encoded labels,
    and prints the predicted class index for every held-out sample.
    """
    iris = load_iris()
    print(iris["target"])
    train_data, test_data, train_target, test_target = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=1)
    # One-hot encode the training labels. The original also binarized the
    # full target set (result discarded) and the test labels (never used);
    # both dead computations are dropped.
    labels_train = LabelBinarizer().fit_transform(train_target)

    model = Sequential(
        [
            Dense(5, input_dim=4),
            Activation("relu"),
            Dense(3),
            Activation("sigmoid"),
        ]
    )
    # Optimizer
    # NOTE(review): newer Keras releases rename ``lr`` to ``learning_rate``
    # and drop ``decay`` - adjust this line to the installed Keras version.
    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(optimizer=sgd, loss="categorical_crossentropy")
    # ``nb_epoch`` is the Keras 1 spelling; Keras 2+ expects ``epochs``.
    model.fit(train_data, labels_train, epochs=200, batch_size=40)
    # ``Sequential.predict_classes`` has been removed from Keras; taking the
    # argmax of the predicted scores yields the same class indices.
    print(np.argmax(model.predict(test_data), axis=-1))
    #model.save_weights("D:/w")
    #model.load_weights("D:/w")


if __name__ == "__main__":
    main()

Python資料分析學習總結

Python資料分析基礎 numpy 開源、資料計算擴充套件；ndarray、多維操作、線性代數 numpy使用程式： import numpy as np def main(): lst=[[1,3,5],[2,4,6]] print(type(lst)) np_lst=n

Python資料分析學習路徑圖

本文摘自同行說使用者“風一樣的男子”，原文連結：http://www.yidianzixun.com/n/0CAz84ve?s=1&appid=yidian，如涉及版權問題請及時聯絡小編！ Python是一種面向物件、直譯式計算機程式設計語言，由Guido van Rossum於1989

史上最全Python資料分析學習路徑圖

Python是一種面向物件、直譯式計算機程式設計語言，由Guido van Rossum於1989年底發明。由於他簡單、易學、免費開源、可移植性、可擴充套件性等特點，Python又被稱之為膠水語言。下圖為主要程式語言近年來的流行趨勢，Python受歡迎程度扶搖直上。圖

Python資料分析學習筆記（1）numpy模組基礎入門

numpy模組可以進行高效的資料處理，並提供了陣列的支援，很多模組都依賴他，比如pandas、scipy、matplotlib等，因此這個模組是基礎。（1）匯入： import numpy （2）建立一維和二維陣列： #建立一維陣列 x=numpy.

Python資料分析學習筆記（6）資料規約實戰--以主成分分析PCA為例

一、相關理論： 1、資料規約：產生更小且保持資料完整性的新資料集。意義在於降低無效、錯誤資料；降低儲存成本；少量且具有代表性的資料大幅加快，主要分為以下兩類： ①屬性規約：屬性合併或刪除無關維，目標是尋找最小子集使子集概率分佈儘可能與原來相同。常用方法：（

Python資料分析學習筆記——DataFrame(還在更新中)

pandas的官方文件 1.DataFrame DataFrame是一個表格型的資料結構，它含有一組有序的列，每列可以是不同的值型別（數值、字串、布林值等）。DataFrame既有行索引也有列索引，它可以被看做由Series組成的字典（共用同一個索引）。 DataFrame可以通過類

python資料分析學習筆記七

第七章訊號處理與時間序列（需要統計學知識） 1 statsmodels 子庫示例程式碼如下 import pkgutil as pu import pydoc import statsmodels as sm # statmodels版本號 print("statm

python 資料分析學習筆記（第三章）

boxplot 箱形圖 catering_sale = '../data/catering_sale.xls' #餐飲資料 data = pd.read_excel(catering_sa

python資料分析學習筆記九

第九章分析文字資料和社交媒體 1 安裝nltk 略 2 濾除停用字姓名和數字示例程式碼如下: import nltk # 載入英語停用字語料 sw = set(nltk.corpus.stopwords.words('english')) print('Sto

「機器學習」Python資料分析之Numpy進階

請點選此處輸入圖片描述進階廣播法則(rule) 廣播法則能使通用函式有意義地處理不具有相同形狀的輸入。廣播第一法則是，如果所有的輸入陣列維度不都相同，一個“1”將被重複地新增在維度較小的陣列上直至所有的陣列擁有一樣的維度。廣播第二法則確定長度為1的陣列沿著特

「機器學習」Python資料分析之Numpy

請點選此處輸入圖片描述 NumPy的主要物件是同種元素的多維陣列。這是一個所有的元素都是一種型別、通過一個正整數元組索引的元素表格(通常是元素是數字)。在NumPy中維度(dimensions)叫做軸(axes)，軸的個數叫做秩(rank)。例如，在3D空間一個點的座標[1,

Python資料分析基礎教程：NumPy學習指南（第2版） pdf 下載

罕見的NumPy中文入門教程，Python資料分析優選從基礎的知識講起，手把手帶你進入大資料探勘領域囊括大量具有啟發性與實用價值的實戰案例。內容簡介　　《圖靈程式設計叢書;Python資料分析基礎教程：NumPy學習指南（第2版）》是NumPy的入門教程，主要介紹NumPy以及相關

分享《Python資料分析基礎教程：NumPy學習指南(第2版)》高清中文PDF+英文PDF+原始碼

下載：https://pan.baidu.com/s/1YSD97Gd3gmmPmNkvuG0eew更多資料分享：http://blog.51cto.com/3215120 《Python資料分析基礎教程：NumPy學習指南(第2版)》高清中文PDF+高清英文PDF+原始碼高清中文版PDF，249頁，帶

分享《Python資料分析基礎教程：NumPy學習指南(第2版)》高清中文PDF+高清英文PDF+原始碼

下載：https://pan.baidu.com/s/1YSD97Gd3gmmPmNkvuG0eew 更多分享資料：https://www.cnblogs.com/javapythonstudy/ 《Python資料分析基礎教程：NumPy學習指南(第2版)》高清中文PDF+高清英文PDF+原始碼高清