# 評分卡模型-(一特徵構建)
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 16 09:24:18 2018

@author: wangxihe
"""
import os
import datetime
import collections

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Every CSV path used below is relative to this working directory.
os.chdir(r'E:\spyderwork\評分卡模型\一特徵構建')

plt.rcParams['font.sans-serif'] = ['SimHei']  # font used so Chinese labels render
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign rendering correctly

#%% Read the data
MasterData = pd.read_csv(
    'PPD_Training_Master_GBK_3_1_Training_Set.csv', encoding='gbk')
LoginData = pd.read_csv(
    'PPD_LogInfo_3_1_Training_Set.csv', encoding='gbk')
UpdateData = pd.read_csv(
    'PPD_Userupdate_Info_3_1_Training_Set.csv', encoding='gbk')

#%% Parse the login-log date columns (dash-separated year-month-day strings)
for date_col in ('Listinginfo1', 'LogInfo3'):
    LoginData[date_col] = LoginData[date_col].apply(
        lambda text: datetime.datetime.strptime(text, '%Y-%m-%d'))
#%% Number of whole days between Listinginfo1 and LogInfo3 for every login row
LoginData['LogDay'] = (LoginData['Listinginfo1'] - LoginData['LogInfo3']).dt.days

#%% Distribution of the day gap, drawn through pandas with 200 bins
LoginData['LogDay'].plot.hist(bins=200)

#%% Same distribution drawn through matplotlib with 300 bins and a title
plt.hist(LoginData['LogDay'], bins=300)
plt.title('登入天數分佈')

#%% Login method
# Frequency of each LogInfo2 value (the bare expression is only useful when
# the cell is run interactively), followed by a horizontal bar chart of it.
LoginData['LogInfo2'].value_counts()
LoginData['LogInfo2'].value_counts().sort_values().plot(kind='barh')


#%%
def MyDiv(x, y):
    """Return x / y as a float, or 0 when the denominator is None or zero.

    Parameters
    ----------
    x : number
        Numerator.
    y : number or None
        Denominator.

    Returns
    -------
    int or float
        The integer 0 when ``y`` is None or equal to 0, otherwise the
        floating-point quotient ``x / y``.
    """
    # Identity check for None (PEP 8); a missing and a zero denominator are
    # treated the same way, so they share one guard.
    if y is None or y == 0:
        return 0
    return x * 1.0 / y


#%%
# Look-back window lengths in days, and the login-log columns to summarise.
Tw = [7, 30, 60, 90, 120, 150, 180]
cols = ['LogInfo1', 'LogInfo2']
# One row per distinct Idx that appears in the login log.
LoginIdxDf = pd.DataFrame({'Idx': LoginData['Idx'].drop_duplicates()})

for day in Tw:
    # Keep only the log rows whose LogInfo3 date is no earlier than `day`
    # days before Listinginfo1.  The LogTime column stays on LoginData.
    LoginData['LogTime'] = LoginData['Listinginfo1'] - datetime.timedelta(days=day)
    TempDf = LoginData[LoginData['LogInfo3'] >= LoginData['LogTime']]
    for var in cols:
        prefix = str(var) + '_' + str(day)

        # Total number of non-null records per Idx inside the window;
        # an Idx with no row in the window gets 0.
        total_by_idx = TempDf.groupby('Idx')[var].count()
        LoginIdxDf[prefix + '_totalnum'] = (
            LoginIdxDf['Idx'].map(total_by_idx).fillna(0).astype('int64'))

        # Number of distinct values per Idx inside the window.
        UnionTempDf = TempDf[['Idx', var]].drop_duplicates()
        unique_by_idx = UnionTempDf.groupby('Idx')[var].count()
        LoginIdxDf[prefix + '_unique'] = (
            LoginIdxDf['Idx'].map(unique_by_idx).fillna(0).astype('int64'))

        # Ratio total / distinct (0 when there is nothing in the window).
        # The two columns are paired explicitly instead of indexing a row
        # Series with x[0] / x[1], which relies on deprecated positional
        # fallback for label-indexed Series.
        LoginIdxDf[prefix + '_rate'] = [
            MyDiv(total, distinct)
            for total, distinct in zip(LoginIdxDf[prefix + '_totalnum'],
                                       LoginIdxDf[prefix + '_unique'])
        ]
LoginIdxDf.to_csv('Log.csv')

#%% Parse the update-log date columns (slash-separated year/month/day strings)
UpdateData['ListingInfo1'] = UpdateData['ListingInfo1'].apply(
    lambda x: datetime.datetime.strptime(x, '%Y/%m/%d'))
UpdateData['UserupdateInfo2'] = UpdateData['UserupdateInfo2'].apply(
    lambda x: datetime.datetime.strptime(x, '%Y/%m/%d'))

#%% Explore which fields get updated (bare expressions are for interactive use)
UpdateData['UserupdateInfo1'].value_counts().sort_values(ascending=False)
len(UpdateData['UserupdateInfo1'].value_counts())
# NOTE: despite its name, updateTop10 holds the 20 most frequent values.
updateTop10 = (UpdateData['UserupdateInfo1'].value_counts()
               .sort_values(ascending=False).head(20).copy())
updateTop10.sort_values().plot(kind='barh')

#%% Upper-case the field names so case-only variants collapse, then recount
# .str.upper() leaves missing values untouched instead of raising on them.
UpdateData['UserupdateInfo1'] = UpdateData['UserupdateInfo1'].str.upper()
len(UpdateData['UserupdateInfo1'].value_counts())
def updateNumber(x):
    """Return '_PHONE' for '_MOBILEPHONE'; return any other value unchanged."""
    return '_PHONE' if x == '_MOBILEPHONE' else x


UpdateData['UserupdateInfo1'] = UpdateData['UserupdateInfo1'].apply(updateNumber)

#%% Per-Idx update-log features for every look-back window in Tw
# One row per distinct Idx that appears in the update log.
UpdateIdxDf = pd.DataFrame({'Idx': UpdateData['Idx'].drop_duplicates()})
for day in Tw:
    # Keep only the update rows whose UserupdateInfo2 date is no earlier than
    # `day` days before ListingInfo1.  The LogTime column stays on UpdateData.
    UpdateData['LogTime'] = UpdateData['ListingInfo1'] - datetime.timedelta(days=day)
    TempDf = UpdateData[UpdateData['UserupdateInfo2'] >= UpdateData['LogTime']]
    prefix = 'Update_' + str(day)

    # Number of non-null update records per Idx inside the window.
    freq_by_idx = TempDf.groupby('Idx')['UserupdateInfo1'].count()
    UpdateIdxDf[prefix + '_freq'] = (
        UpdateIdxDf['Idx'].map(freq_by_idx).fillna(0).astype('int64'))

    # Number of distinct updated fields per Idx inside the window.
    UnionTempDf = TempDf[['Idx', 'UserupdateInfo1']].drop_duplicates()
    unique_by_idx = UnionTempDf.groupby('Idx')['UserupdateInfo1'].count()
    UpdateIdxDf[prefix + '_unique'] = (
        UpdateIdxDf['Idx'].map(unique_by_idx).fillna(0).astype('int64'))

    # Ratio freq / distinct.  Columns are paired explicitly rather than
    # indexing a row Series with x[0] / x[1] (deprecated positional fallback).
    UpdateIdxDf[prefix + '_rate'] = [
        MyDiv(freq, distinct)
        for freq, distinct in zip(UpdateIdxDf[prefix + '_freq'],
                                  UpdateIdxDf[prefix + '_unique'])
    ]

    # Flags for changes to selected key attributes: 1 when the Idx updated
    # that exact field inside the window, else 0.  Exact equality is used
    # because a substring test on the concatenated field names would also
    # fire on any longer field name that merely contains the item.
    for item in ['_IDNUMBER', '_HASBUYCAR', '_MARRIAGESTATUSID', '_PHONE']:
        changed_idx = UnionTempDf.loc[UnionTempDf['UserupdateInfo1'] == item, 'Idx']
        UpdateIdxDf['UserupdateInfo_' + str(day) + str(item)] = (
            UpdateIdxDf['Idx'].isin(changed_idx).astype('int64'))
UpdateIdxDf.to_csv('update.csv')

#%% Check whether the four location columns agree
# city_match is 1 only when UserInfo_2, UserInfo_4, UserInfo_8 and UserInfo_20
# are all equal on the row (a missing value never compares equal).
MasterData['city_match'] = (
    (MasterData['UserInfo_2'] == MasterData['UserInfo_4'])
    & (MasterData['UserInfo_4'] == MasterData['UserInfo_8'])
    & (MasterData['UserInfo_8'] == MasterData['UserInfo_20'])
).astype('int64')
# The raw columns are dropped once the match flag has been derived.
for location_col in ['UserInfo_2', 'UserInfo_4', 'UserInfo_8', 'UserInfo_20']:
    del MasterData[location_col]
# Persist the master table (GBK-encoded).
MasterData.to_csv('master.csv', encoding='gbk')

#%% Join master, update and login feature tables column-wise on Idx
indexed_frames = [
    frame.set_index('Idx')
    for frame in (MasterData, UpdateIdxDf, LoginIdxDf)
]
allData_0 = pd.concat(indexed_frames, axis=1)
allData_0.to_csv('Idx0.csv', encoding='gbk')
#%% Whole days between Listinginfo1 and LogInfo3 for every login row
# Vectorised datetime subtraction; a row-wise apply that indexes each row with
# x[0] / x[1] depends on deprecated positional fallback and is far slower.
LoginData['MinueDays'] = (LoginData['Listinginfo1'] - LoginData['LogInfo3']).dt.days
def TimeWindowSelection(df, col, tw):
    """Count, for each window length, the rows whose ``col`` value fits in it.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the column to test.
    col : str
        Name of the column compared against each window length.
    tw : iterable
        Window lengths to evaluate.

    Returns
    -------
    dict
        Maps every window length to the number of rows with
        ``df[col] <= length``.
    """
    values = df[col]
    return {window: int((values <= window).sum()) for window in tw}


# Coverage of the login records for a range of candidate window lengths.
tw_dict = TimeWindowSelection(
    LoginData, 'MinueDays', [7, 15, 30, 60, 90, 120, 150, 180])
tw_df = pd.DataFrame.from_dict(tw_dict, orient='index')
tw_df.plot(kind='bar')
#%%
# Whole days from each update (UserupdateInfo2) to ListingInfo1.
# Vectorised subtraction replaces a row-wise apply that indexed each row with
# x[0] / x[1], which depends on deprecated positional fallback.
UpdateData['MinueDays'] = (
    UpdateData['ListingInfo1'] - UpdateData['UserupdateInfo2']).dt.days

# Exact frequency of every day gap.
t = collections.Counter(UpdateData['MinueDays'])

# Histogram with numpy's default binning; 'gap' is the upper edge of each bin.
bin_counts, bin_edges = np.histogram(UpdateData['MinueDays'])
hist_ListingGap = pd.DataFrame({'Freq': bin_counts, 'gap': bin_edges[1:]})
hist_ListingGap['CumFreq'] = hist_ListingGap['Freq'].cumsum()
# Cumulative share of all records: divide by the final cumulative count once
# instead of re-reading the last row for every element.
hist_ListingGap['CumPercent'] = (
    hist_ListingGap['CumFreq'] * 1.0 / hist_ListingGap['CumFreq'].iloc[-1])
#%%
#groupby collections.Counter np.histogram concat merger