library(mvtnorm)# col.normlibrary(tilting)# col.normlibrary(ggplot2)library("np")#npreg: you may need to install ithpdata<-read.csv("./data/dataset-hubway2.csv",header=TRUE,sep=",",quote="\"",dec=".")hpdata$TOWN<-NULLhpdata$year<-NULL#zero-variance#write.table(hpdata,"./output/dataset-hubway3.csv", sep = ",",dec = ".")colnames(hpdata)
Nonparametric Kernel Methods for Mixed Datatypes (version 0.60-10)
[vignette("np_faq",package="np") provides answers to frequently asked questions]
[vignette("np",package="np") an overview]
[vignette("entropy_np",package="np") an overview of entropy-based methods]
Warning message in file(file, ifelse(append, "a", "w")):
“cannot open file './output/dataset-hubway3.csv': No such file or directory”
Error in file(file, ifelse(append, "a", "w")): cannot open the connection
Traceback:
1. write.table(hpdata, "./output/dataset-hubway3.csv", sep = ",",
. dec = ".")
2. file(file, ifelse(append, "a", "w"))
In [45]:
# PCA Scree plot without standardizing datahpca_cov<-prcomp(hpdata,scale=FALSE)# We can extract the information summarized above (and much more) # from the attributes of the object hpca_covstandard_deviation_of_each_component<-hpca_cov$sdevvar_per_dim<-standard_deviation_of_each_component^2var_tot<-sum(var_per_dim)var_totvar_per_dim/var_totvar_prop<-var_per_dim/sum(var_per_dim)var_propcum_var<-cumsum(var_prop)cum_varplot(cum_var,xlab="Principal component",ylab="Proportion of variance explained",ylim=c(0,1),type='b')
8298462.56254009
0.604277079010252
0.289490773781669
0.0547284503622438
0.0208837985727226
0.00807095342683796
0.00752787785881026
0.00515652076565351
0.00415842007503802
0.0027707562726213
0.000885404693833313
0.000529822683554946
0.000340909314077188
0.000292504731544651
0.000254839153032317
0.000180542047466206
0.000126918406239684
0.000110191072847556
6.73393420643508e-05
5.22717833553464e-05
3.76655747196708e-05
2.18217307948317e-05
9.48624010997684e-06
8.36822055040508e-06
7.04940971690849e-06
4.48035405488206e-06
1.96209342640623e-06
1.79682472877614e-06
8.8844562469251e-07
4.61726299002591e-07
2.92303519979804e-07
2.04167002152802e-07
1.18758877473866e-07
1.81891767363846e-08
9.24575963168155e-09
3.36177378469594e-09
1.98160423349925e-32
3.89285291618039e-33
3.89285291618039e-33
3.89285291618039e-33
2.00135064939812e-33
0.604277079010252
0.289490773781669
0.0547284503622438
0.0208837985727226
0.00807095342683796
0.00752787785881026
0.00515652076565351
0.00415842007503802
0.0027707562726213
0.000885404693833313
0.000529822683554946
0.000340909314077188
0.000292504731544651
0.000254839153032317
0.000180542047466206
0.000126918406239684
0.000110191072847556
6.73393420643508e-05
5.22717833553464e-05
3.76655747196708e-05
2.18217307948317e-05
9.48624010997684e-06
8.36822055040508e-06
7.04940971690849e-06
4.48035405488206e-06
1.96209342640623e-06
1.79682472877614e-06
8.8844562469251e-07
4.61726299002591e-07
2.92303519979804e-07
2.04167002152802e-07
1.18758877473866e-07
1.81891767363846e-08
9.24575963168155e-09
3.36177378469594e-09
1.98160423349925e-32
3.89285291618039e-33
3.89285291618039e-33
3.89285291618039e-33
2.00135064939812e-33
0.604277079010252
0.893767852791922
0.948496303154165
0.969380101726888
0.977451055153726
0.984978933012536
0.99013545377819
0.994293873853228
0.997064630125849
0.997950034819682
0.998479857503237
0.998820766817315
0.999113271548859
0.999368110701892
0.999548652749358
0.999675571155597
0.999785762228445
0.999853101570509
0.999905373353865
0.999943038928584
0.999964860659379
0.999974346899489
0.99998271512004
0.999989764529757
0.999994244883811
0.999996206977238
0.999998003801967
0.999998892247591
0.99999935397389
0.99999964627741
0.999999850444412
0.99999996920329
0.999999987392467
0.999999996638226
1
1
1
1
1
1
In [5]:
apply(hpdata,2,mean)apply(hpdata,2,var)
station_id
108.403
month
8.676
day
17.814
hour
16.461
num_of_pickups
1.689
num_of_dropoffs
1.582
day_of_week
4.119
weekend
0.305
num_bikes_available
6.17400909090909
num_bikes_disabled
0.550924242424242
num_docks_available
11.0545666666667
num_docks_disabled
0.0155
TAZ_id
342.766
Tot_Pop
1152.306
HH_Pop
970.562
HH
466.605
Income_low
166.012
Income_mid_low
119.076
Income_mid_high
90.184
Income_high
91.333
Worker0
121.394
Worker1
195.038
Worker2
118.083
Worker3p
32.09
HHSize1
187.066
HHSize2
156.064
HHSize3
62.662
HHSize4
36.875
HHSize5p
23.938
Veh0
164.424
Veh1
206.3
Veh2
51.81
Veh3p
44.071
Tot_Vehs
457.168
Age0to4_enrollment
15.874
Age5to14_enrollment
71.945
Age15to18_enrollment
46.089
Age19plus_commuters
399.697
Age19plus_dorms
160.874
num_bikes
6.72493333333333
station_id
4190.07266366366
month
0.219243243243243
day
77.1125165165165
hour
16.2527317317317
num_of_pickups
7.65793693693694
num_of_dropoffs
5.33260860860861
day_of_week
3.97881781781782
weekend
0.212187187187187
num_bikes_available
27.0836950869235
num_bikes_disabled
0.71278812765934
num_docks_available
39.2460889673052
num_docks_disabled
0.0323059448337226
TAZ_id
57926.4817257257
Tot_Pop
1099763.90627027
HH_Pop
975955.245401401
HH
193299.980955956
Income_low
29846.2541101101
Income_mid_low
15488.6668908909
Income_mid_high
10066.3024464464
Income_high
13211.2673783784
Worker0
15498.6193833834
Worker1
34179.2918478478
Worker2
18497.9901011011
Worker3p
2574.89279279279
HHSize1
31522.5161601602
HHSize2
23486.6265305305
HHSize3
4979.6393953954
HHSize4
2311.05042542543
HHSize5p
1828.07222822823
Veh0
28917.4977217217
Veh1
44057.7977977978
Veh2
7539.14504504504
Veh3p
2831.51947847848
Tot_Vehs
261352.878654655
Age0to4_enrollment
1876.78490890891
Age5to14_enrollment
33439.3353103103
Age15to18_enrollment
56187.874953954
Age19plus_commuters
4464921.89909009
Age19plus_dorms
862505.447571572
num_bikes
27.6643796736682
From now on, we use only the correlation-matrix-based PCA (standardized).
First, we describe with plots and comment on how the first two principal components relate to the original columns in both covariance-matrix-based PCA and correlation-matrix-based (standardized) PCA. In both cases, is it possible to give a name to each of the two principal components?
In [38]:
hpca_cor<-prcomp(hpdata,scale=TRUE)standard_deviation_of_each_component<-hpca_var$sdevvar_per_dim<-standard_deviation_of_each_component^2var_tot<-sum(var_per_dim)var_prop<-var_per_dim/sum(var_per_dim)cum_var<-cumsum(var_prop)plot(cum_var,xlab="Principal component",ylab="Proportion of variance explained",ylim=c(0,1),type='b')
# Let us plot the contribution of the original dimension to the 1st PCAPC_contr<-eigenvectors[,c("PC1")]# PC_contr# We order by the magnitude of the contribution# We use the - sign because we want a descending orderord<-order(-abs(PC_contr))PC_contr<-PC_contr[ord]#PC_contr# We just select the 5 highest contributing dimensions (highest loadings)PC_contr<-PC_contr[1:5]PC_contrbarplot(PC_contr,main="Contribution to the 1st component",xlab="Original Dimensions")
HH
-0.242645523213089
HH_Pop
-0.242500207248033
Tot_Vehs
-0.235198898109267
Income_mid_low
-0.234366462782011
HHSize3
-0.233138329774904
Since the household size-related variables have negative loadings, we can describe the first principal component as the "Small low-car-usage household"
In [52]:
# Second principal component vectorPC_contr<-eigenvectors[,c("PC2")]# We order by the magnitude of the contribution# We use the - sign because we want a descending orderord<-order(-abs(PC_contr))PC_contr<-PC_contr[ord]# We just select the 5 highest contributing dimensionsPC_contr<-PC_contr[1:5]options(repr.plot.width=12,repr.plot.height=8)barplot(PC_contr,main="Contribution to the 2nd component",xlab="Original Dimensions")
Based on the value of the important loadings, the second component can be described as "Bike Usage" or "Bike unavailability"
We show with plots and comments if the variable num of bikes can be explained by the first two principal components. We visualize in the same plot the first 2 components and some indication of the number of pickups.
In [64]:
options(repr.plot.width=15,repr.plot.height=10)df<-data.frame(SmallHH=scores[,1],Unavailability=scores[,2],Num.pickups=hpdata$num_of_pickups)#quantile(df$pickups, 0.25)df$Pickups.yes<-cut(df$Num.pickups,c(-Inf,0,Inf))# create category for pickups (yes) or no pickups#df <- df[which(hpdata$hour>=8 & hpdata$hour<=10),]ggplot(df,aes(x=SmallHH,y=Unavailability))+geom_point(aes(size=Num.pickups,color=Pickups.yes),alpha=0.75)+theme_gray(base_size=25)