From d9682bb32c84284a69cdd319c6211d79e964fd73 Mon Sep 17 00:00:00 2001 From: adinathauti Date: Sun, 9 Jul 2017 11:38:32 +0000 Subject: [PATCH] Done --- build.py | 68 +++++++++++++++++++++-- build.pyc | Bin 0 -> 3228 bytes tests/__init__.pyc | Bin 0 -> 172 bytes tests/test_get_categorical_variables.pyc | Bin 0 -> 2805 bytes 4 files changed, 63 insertions(+), 5 deletions(-) create mode 100644 build.pyc create mode 100644 tests/__init__.pyc create mode 100644 tests/test_get_categorical_variables.pyc diff --git a/build.py b/build.py index c7d540b..c679a77 100644 --- a/build.py +++ b/build.py @@ -1,21 +1,79 @@ +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +dataframe = pd.read_csv('./data/employee_retention_data.csv') +#print dataframe.head(10) + + def get_categorical_variables(df): - return [] + col_names = df.columns.values + ls = [] + for ele in col_names: + #print df[ele].value_counts().index + if len(df[ele].value_counts().index)<10: + #print("{} is a categorical column as it has only {} categories\n".format(ele,len(df[ele].value_counts().index))) + ls.append(ele) + ls.append('join_date') + ls.append('quit_date') + df2 = pd.DataFrame(df,columns=ls) + #print df.info() + return df2 def get_numerical_variables(df): - return [] + col_names = df.columns.values + ls = [] + for ele in col_names: + if (df[ele].dtype == 'int64' or df[ele].dtype == 'float64') & (ele not in get_categorical_variables(df).columns.values): + ls.append(ele) + df2 = pd.DataFrame(df,columns=ls) + return df2 def get_numerical_variables_percentile(df): - pass + df_1 = get_numerical_variables(df) + dic1 = df_1.describe().values + ls = zip(*dic1) + df_2 = pd.DataFrame(ls,columns=['count','mean','std','min','25th Percentile','50th Percentile','75th Percentile','max']) + ls2 = get_numerical_variables(dataframe).columns.tolist() + df_2['variable name']=ls2 + df_2 = df_2[['variable name','mean','25th Percentile','50th Percentile','75th Percentile']] + df_2['median'] = df_2['50th Percentile'] + df_2 = df_2[['variable name','mean','median','25th Percentile','50th Percentile','75th Percentile']] + return df_2 def get_categorical_variables_modes(df): - pass + df_3 = get_categorical_variables(df) + dic = {} + for ele in df_3.columns.values: + #print('mode of {} is {}'.format(ele,df_3[ele].mode().values[0])) + if ele not in dic: + dic[ele] = df_3[ele].mode().values[0] + print dic + df_x = pd.DataFrame(dic,index=[0],columns=dic.keys()) + return df_x def get_missing_values_count(df): - pass + cols = df.columns.values + cnt = 0 + df_tmp = df + col_ls = [] + val_ls = [] + for col in cols: + col_ls.append(col) + for e in df[col].values: + if pd.isnull(e) == True: + cnt+=1 + val_ls.append(cnt) + cnt = 0 + print zip(col_ls,val_ls) + dfz = pd.DataFrame([col_ls,val_ls]) + dfz = dfz.T + dfz.rename(columns={0:'var_name',1:'missing_value_count'},inplace=True) + return dfz def plot_histogram_with_numerical_values(df): diff --git a/build.pyc b/build.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0ec0db3657f7dc7f7402f007c98b50f6bf6acd66 GIT binary patch literal 3228 zcmcIm-EJF26h5%m35=z|`+JXpG1rcyj1+=XcHBx0ZI}>M|^?I{2 zE=d(jNCAlp9)&01b$A3G0KRj^n9d#JEDbA;u!< zrWi}4TVh=0iJtY^nAHA-zb$tQ=AFJye9|}LBAXl=GboKWxwn&i!2Ua_dzC7LE#R%= z{SvSH8O;gN2#=zfq9M^NqNC60MA7EU2Q;gZk2vC_M%_`3P7zq=m16-1L}iywBHkCe z;Z%oR!m}NGVT)@1B?R~eZyw|6I{)%isTqtqHp9!>vK_RZ_cfYWS+|jIhg0i$7sfZy zmrkt)C&a&PX}q`@2({={v=-?_$8A@q-kbA5Kb}CfVRsQrcuU zGmaDG-bF)dQ8m<3v=*(XCH${OXHfID1?pd5{0G!K0qU5cu5gY7-iV4X&buNy02M$8 z2?X4MH3v<4GUoxHyubtGeAE!M@skT^GXb~ekobom3#@zDB!Ml6_84ceVg^(qwLdOQ zyv!ZaEAVp)1hAN?6JOzmp^ulixr*j8(RmN!GdlTnZ2p1G`{;5!{~Q~&p;pwY5?q+B zyi&pCcNkuvK}u1As65o-DM}#XTI6g;^egHTnHLQ~j!}K6E=UK;2yy|TfJ8tXbeo2) zc~xOdFr8s)(g;qBfI7v4pLA#=eDDy}kB1A?4b_Ojy4-)|e3b=vf`(bQjCUQJ>!_}}s%`}CF-W{I;ogrh z&U4QO`F}hoAsD#%@~$X@un?tQu5lkISe*u<6 zNBcp+zo!|r7D-2t)V(@-B=U`7g`jM{?9r^ra)JZ=q{+enRfVFLPv;;#!tjSr`Pqg{ z@Gx>mQDNf44`~~Ph6&JlDgetxCOC!?I&0BMi^XPc;8If~3;3ZWiXO63Mfhm&%%>D7 z1V0m`w*-F6*OO3%T%xFPId(ml^Vm9P^C8YYobusFmqWwmMTUd5C+dGjl{W{3XoGe6 zG|Pko&&#QaKj3*d*`8;7E-|HHDGB-^w-BZlPX2J(r@5CXh=3m#V)Cs3y-ZmwicQuR zcxBM)a`5^x3CCv)6YfHCT*%I6=yNOwfoTog4{$@kKc7O^@Y#VTtg3d!Jn=0gj^7C3ggDa z-aJ55B)LwU$c4WK_c-x9cxHETwPH)qo12VYk$E{cN|WdTF3j-_IK!2LNf>iJW8Q!K s2{)gjd6NyX{{I_b3zKX>Jf@j>%oxapZ;|CaTCTU3+nx4Od$GOpH_KwI0RR91 literal 0 HcmV?d00001 diff --git a/tests/__init__.pyc b/tests/__init__.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d26765d85573ef7b3229f16cf124508a94186223 GIT binary patch literal 172 zcmZ9FK?=e!5Je-p5JAt;O`Hpecmg*f;wp^ENP}&rWTuOr)1!C+r{KyD|9$w2|CgWb z=DmB|qx)%Lc{0n&kes_Tb8~JC?hkUc4qEOBFy(^U;;0#JTwgL^B#4{{D4~CH_0$g? nPei_wQqj!CCsY(tP!bggA(yNn?1xF$wtVP0jU655e=)l+AVn(B literal 0 HcmV?d00001 diff --git a/tests/test_get_categorical_variables.pyc b/tests/test_get_categorical_variables.pyc new file mode 100644 index 0000000000000000000000000000000000000000..15b385ce68f4be3ee42afc6c64d7ffd1ce357a17 GIT binary patch literal 2805 zcmc&$+fLg+5FO`AC@r+~u3n_7Qu6|S>Km%6Qnhb97^I4*lC{_qaB%F@yGDuxPxKG_ zDgA(U&KN=jq~e7p;PK9S_HxeI+1>i@yNkc>?d?W%_N&YLqdc}JCYFDXQlgtHZkxEJ?gGUU;8EYBxJf^WPKkDV9dxnXZAtgZANh4X zDY*qPJ8a#ivAkv9h_~*?#u=Z7qck5ghXp&%T#{!Y>enNChNFtMl zN2W;3UdncY3S)ed$G#WC1Tca{rqrteAqs-1riBKep{9xXqtK#YyW5nO5&8i`csAv+ z-^EA=U`1Iv5tfd{uam0@Uhkwv$3z3^yg|n`I#g!7v#RFRs@7ChOu|!J2H2V+>Y4ca zv+&B8GM&C_;Kd{7I#L+qiE_s3{U{R$2DytM2UL4ylEwj=2EgSO#TaY3a63iGU9Cf~ z<+QJ14pjx)uDnQ%J(QAvkPq26agrH#U`m(x-}2(nj!eXUl*jCIY}WdDv39QM+COF zd|=JWAzxa*l^#HHPFa}5s!x;;M9jrJLHu$sVT-PaDep_EWtpu0E4Td z0h(1GK++tgd?CY6q`ZcN2lCwl!G(}hQet6lB4R_#jii+9kW7@7KagaUB(3(fZKZ2k z7i>Z^bO}dXMA(f|R$4VnE2=Tu3#wJC#3q?_CW~0}b5u$8Syud)!VIeftK`tebC%B_#+tb=ghSb6>}5faUKfG{t8gE}JQ6dN3K}ipro4|7YFMtcGX8-^I literal 0 HcmV?d00001