/
PCA_demo.R
153 lines (121 loc) · 8.39 KB
/
PCA_demo.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# Principle Component Analysis in R for Engineers and Geoscientists New to R
# Michael Pyrcz, University of Texas at Austin, Twitter @GeostatsGuy
# This will be used in my Introduction to Geostatistics Undergraduate Class
# It is assumed that students have no previous R experience. We take an multivariate dataset
# and only retain the two variables for a simple demonstration of PCA.
# Load the required libraries, you may have to first go to "Tools/Install Packages..." to install these first
library(plyr) # splitting, applying and combining data by Hadley Wickham
library(ggplot2) # for the custom biplot
library(lattice) # for the matrix scatter plot
library(corrplot) # for the corrplot correlation plot
# Declare functions
# no functions required in this demonstration
# Set the working directory, I always like to do this so I don't lose files and to simplify subsequent read and writes
setwd("C:/PGE337/PCA") # choose your local working directory
# Read the data table from a comma delimited file - data on GitHub/GeostatsGuy/GeoDataSets
mydata = read.csv("unconv_MV.csv") # read in comma delimited data file
# Let's visualize the first several rows of our data so we can make sure we successfully loaded it
head(mydata) # show the first several rows of a data table in the console
# Check out the summary statistics for each column
summary(mydata) # summary statistics for the multivariate data file
# Calculate the correlation matrix
mydata_noindex <- mydata[,2:length(mydata)] # remove the first column with the well index
cor_matrix <- round(cor(mydata_noindex),2) # calculate a mxm matrix with the correlation coeficients
cor_matrix
# Let's use the corrplot package to make a very nice correlation matrix visualization
corrplot(cor_matrix, method = "circle") # graphical correlation matrix plot
# Now let's view the scatterplot matrices from the lattice Package
splom(mydata[c(2,3,4,5,6,7,8)],col=rgb(0,0,0,50,maxColorValue=255), pch=19,main = "Unconventional Dataset")
# This dataset has varriables from 1,000 unconventional wells including well average porosity, log transform
# of permeability (to linearize the relationships with other variables), accoustic impedance (kg/m2s*106), brittness ratio (%),
# total organic carbon (%), vitrinite reflectance (%), and production (MCFPD)
# Let's start simple with a bivariate (2 variable) problem
mydata_por_perm <- mydata[1:100,2:3] # new dataframe with only 1st 100 wells of por and logperm
head(mydata_por_perm) # check the new dataframe
# Look at a scatter plot of porosity vs. log permeability
plot(LogPerm~Por, mydata_por_perm, main="Log Permeability vs. Porosity",
xlab="Porosity (%)", ylab="Log Permeability", col = alpha("black",0.3), pch=19)
# With the log of permeability we have a very nice linear relationship with porosity
# We are ready to perform PCA with porosity and log of permeability
pca <- prcomp(mydata_por_perm, scale=TRUE) # this does the PCA
# Note, we should scale the data to all have a standard deviation of 1.0. Otherwise the difference
# between the scale of porosity and permeability would have a significant impact. We should always
# scale unless the two variables have the same units.
# Let's see what's in the PCA output
names(pca)
# This includes:
# "sdev" is the scaling applied to get all the standard deviations to 1.0 (m)
# "center" is the means that we subtracted from each variable to get them centered (m)
# "rotation" includes the component / factor loadings (mxm)
# "x" includes the principle component scores for all the data x all the principal components (nxm)
# let's look at the average and scaling factor applied to the data before PCA
pca$center # the means substracted from variables
pca$scale # the standarization factor
# These are m sized arrays as they are distinct for each feature
# Now, let's look at the principle component loadings are in the rotation matrix
pca$rotation
# the 'x' matrix provides us with the columns with the kth principal component score vector
dim(pca$x)
# no suprise there is number of wells columns each nvariables long
# note the porosity and permeabilities are centered first
head(pca$x)
# Let's go head and reverse the PCA with with 2 princple components to check
nComp = 2
Xhat2 = t(t(pca$x[,1:nComp] %*% t(pca$rotation[,1:nComp])) * pca$scale + pca$center)
# Let's plot the original data vs. our back transformation using both PC1 and PC2
# We can confirm that we got back to the original data; therefore, our method works.
par(mfrow=c(2,1))
plot(LogPerm~Por, mydata_por_perm, main="Log Permeability vs. Porosity",
xlab="Porosity (%)", ylab="Log Permeability", xlim = c(8,22), ylim = c(0,3),col = alpha("black",0.3), pch=19)
plot(LogPerm~Por, Xhat2, main="LogPerm vs. Porsity (from PC1 and PC2)",
xlab="Porosity (%)", ylab="LogPerm", xlim = c(8,22), ylim = c(0,3),col = alpha("black",0.3), pch=19)
# The two cross plots should look exactly the same. If so the method is working.
# Now let's attempt dimensional reduction by only retaining the first principle component
nComp = 1 # we only retain PC1
Xhat1 = t(t(pca$x[,1:nComp] %*% t(pca$rotation[,1:nComp])) * pca$scale + pca$center)
# note "%*%" indicate matrix multiplication, otherwise "*" and "+" multiple and add over the columns and
# "t" is the transpose of the matrix, aside matrix math is really easy in R, eh!
# Let's plot original data, principal coordinate scores for 2 and 1 component and backtransform.
par(mfrow=c(2,2))
plot(LogPerm~Por, mydata_por_perm, main="Log Permeability vs. Porosity",
xlab="Porosity (%)", ylab="Log Permeability", xlim = c(8,22), ylim = c(0,3),col = alpha("black",0.3), pch=19)
plot(LogPerm~Por, Xhat1, main="LogPerm vs. Porsity (from PC1 Only)",
xlab="Porosity (%)", ylab="LogPerm", xlim = c(8,22), ylim = c(0,3),col = alpha("black",0.3), pch=19)
plot(PC2~PC1, (pca$x*-1), main="PC2 vs. PC1",
xlab="PC1", ylab="PC2", xlim = c(-4,4), ylim = c(-1,1), col = alpha("black",0.3), pch=19)
plot(rep(0,100)~PC1, (pca$x*-1), main="PC1 Only",
xlab="PC1", ylab="", xlim = c(-4,4), ylim = c(-1,1), col = alpha("black",0.3), pch=19)
# With one principle component we can describe quite a bit of the variance!
# it is useful to look at the biplot
par(mfrow=c(1,1))
biplot(pca,cex = 0.8, scale=0, pch = 21)
# This plot shows the data component scores ploted along with the feature loadings values indicated on right and top axes.
# It may be useful to just look at the factor loadings
theta <- seq(0,2*pi,length.out = 100)
circle <- data.frame(x = cos(theta), y = sin(theta))
p <- ggplot(circle,aes(x,y)) + geom_path()
loadings <- data.frame(pca$rotation,.names = row.names(pca$rotation))
p + geom_text(data=loadings,mapping=aes(x = PC1, y = PC2, label = .names, colour = .names)) +
coord_fixed(ratio=1) + labs(x = "PC1", y = "PC2")
# This is a custom ggplot for veiwing the associations of the features relative to each principal
# component. Note the each feature is balanced between the two principal coordinates.
# Calculate and visualize the variance explained
pr.var=pca$sdev^2
pve=pr.var/sum(pr.var)
pve
# new we know we can explain about 90% of the variance with a single principal component
# Let's plot the variance explained by each principle component
plot(pve , xlab="Principal Component ", ylab="Proportion of Variance Explained ", xlim = c(1,5),ylim=c(0,1),pch=19,col="red")
plot(cumsum(pve), xlab="Principal Component ", ylab=" Cumulative Proportion of Variance Explained ",xlim = c(1,5), ylim=c(0,1),pch = 19,col="red")
# The pca object has a built-in variance by principal component plot
plot(pca, type = "l", main = "Variance by Principal Component")
# Also, we can also summarize the PCA model like this
summary(pca)
# On your own try working with more variables. How few principal components can we use? How much information
# do we lose?
# Hope this was helpful,
# Michael
#
# Michael J. Pyrcz, P.Eng., Ph.D.
# Associate Professor, the University of Texas at Austin (@GeostatsGuy)
# mpyrcz@austin.utexas.edu