数据科学家需要具备的知识   
    

完整的数据分析流程    

观测,变量和数据矩阵    

变量的类型    
数值型+分类型    

变量之间的关系          
变量类型不同,分析方法也不同  

数值变量的特征与可视化       
集中趋势量
分散趋势量
  
稳健统计量:受极端值(异常值)的影响是否大
     
可视化一个变量:箱图 四分位点 以及  极值的定义

分类变量的特征与可视化

两个分类变量的关系  
列联表 +相对频率表   

两个分类变量的关系(可视化):分段条形图,相对频率分段条形图   


一个分类变量、一个数值变量的关系:并排箱图    

小结  

三大绘图系统
- 基本绘图系统
 - Lattice绘图系统
 - ggplot2绘图系统
 
     

基本绘图系统
- 绘图函数 -> graphics包
 - hist/ boxplot/ points/lines/text/title/axis
 - 直方图/箱图/点/线/文字/标题/坐标轴
 
plot  
xlab/ylab/lwd/lty/pch/col   x标签/y标签/线宽/线型/点型/颜色    
全局参数 : bg/mar/las/mfrow/mfcol   背景颜色/边距/坐标轴标签方向/多图布局(按行填充)/多图布局(按列填充)

实例
# Base graphics walkthrough on the built-in `airquality` data set.

# One numeric variable: histogram and box plot of wind speed.
hist(airquality$Wind, xlab = "Wind")
boxplot(airquality$Wind, xlab = "Wind", ylab = "speed(mph)")

# One numeric variable split by a categorical one: side-by-side box plots.
boxplot(Wind ~ Month, airquality, xlab = "Month", ylab = "speed(mph)")

# Two numeric variables: scatter plot, first with `$`, then with with().
plot(airquality$Wind, airquality$Temp)
with(airquality, plot(Wind, Temp))
title(main = "Wind and Temp in NYC")

# Build a plot layer by layer: type = "n" sets up the axes and title without
# drawing any points, so each subset can be added in its own colour.
with(
  airquality,
  plot(Wind, Temp, main = "Wind and Temp in NYC", type = "n")
)
with(
  subset(airquality, Month == 9),
  points(Wind, Temp, col = "red")
)
with(
  subset(airquality, Month == 5),
  points(Wind, Temp, col = "blue")
)
# June to August share one colour; August needs no separate call because it
# is already part of this subset.
with(
  subset(airquality, Month %in% c(6, 7, 8)),
  points(Wind, Temp, col = "black")
)

# Overlay the least-squares line of Temp on Wind.
fit <- lm(Temp ~ Wind, airquality)
abline(fit, lwd = 2)

# Add a legend; pch = 1 is the open circle used by the points() calls above.
legend(
  "topright",
  pch = 1, cex = 1,
  col = c("red", "blue", "black"),
  legend = c("Sep", "May", "Other")
)

全局参数
# Query the current values of some global graphics parameters.
par("bg")     # background colour
par("col")    # default plotting colour
par("mar")    # margins in lines of text: c(bottom, left, top, right)
par("mfrow")  # multi-figure layout c(nrow, ncol), filled row by row
par("mfcol")  # multi-figure layout c(nrow, ncol), filled column by column

# Draw two histograms side by side. par() returns the previous values of the
# parameters it changes, so keep them and restore them afterwards instead of
# assuming the earlier layout was c(1, 1).
old_par <- par(mfrow = c(1, 2))
hist(airquality$Temp)
hist(airquality$Wind)
par(old_par)

# Back on the restored layout.
boxplot(airquality$Temp)
Lattice绘图系统

实例
# Lattice examples: formula-based plots with optional conditioning.
library(lattice)

# Scatter plot of temperature against ozone.
xyplot(Temp ~ Ozone, data = airquality)

# Turn Month into a factor so it can serve as the conditioning variable,
# then draw one panel per month.
airquality$Month <- factor(airquality$Month)
xyplot(
  Temp ~ Ozone | Month,
  data = airquality,
  layout = c(5, 1)
)

# The result of xyplot() can be stored and rendered later with print().
q <- xyplot(Temp ~ Wind, data = airquality)
print(q)

# Simulated two-group data. Always fix the seed when drawing random numbers
# so the result can be reproduced when checking or debugging later.
set.seed(1)
x <- rnorm(100)
f <- rep(0:1, each = 50)
y <- x + f - f * x + rnorm(100, sd = 0.5)
f <- factor(f, labels = c("Group1", "Group2"))

# One panel per group.
xyplot(y ~ x | f, layout = c(2, 1))

# Custom panel function: the points, dashed lines at the panel means of x and
# y, and a red regression line.
xyplot(
  y ~ x | f,
  panel = function(x, y) {
    panel.xyplot(x, y)
    panel.abline(h = mean(y), v = mean(x), lty = 2)
    panel.lmline(x, y, col = "red")
  }
)

ggplot2 绘图系统

实例
# ggplot2 quick-plot examples.
# NOTE(review): qplot() is reported as deprecated in recent ggplot2 releases;
# confirm against the installed version before reusing these calls.
library(ggplot2)

# Month as a factor, so it can drive discrete aesthetics and facets.
airquality$Month <- factor(airquality$Month)

# Scatter plot with colour, shape and size all mapped to Month.
qplot(
  Wind, Temp,
  data = airquality,
  col = Month, shape = Month, size = Month,
  main = "Wind vs.Temperature",
  xlab = "Wind(mph)",
  ylab = "Temperature"
)

# Points plus a smoother.
qplot(
  Wind, Temp,
  data = airquality,
  geom = c("point", "smooth")
)

# Facet by Month: one row per month, then one column per month.
qplot(
  Wind, Temp,
  data = airquality,
  facets = Month ~ .
)
qplot(
  Wind, Temp,
  data = airquality,
  facets = . ~ Month
)

# Only one variable supplied: Wind per month, Wind filled by month, and a
# dot plot of Wind.
qplot(Wind, data = airquality, facets = . ~ Month)
qplot(Wind, data = airquality, fill = Month)
qplot(Wind, data = airquality, geom = "dotplot")

ggplot函数的使用
# Layered plots built with ggplot().
library(ggplot2)

# Scatter plot coloured by month with one overall regression line.
# alpha and size are fixed values, so they are set outside aes(); inside
# aes() they would be treated as data mappings, which adds spurious legends
# and does not apply the requested transparency and point size.
ggplot(airquality, aes(Wind, Temp)) +
  geom_point(aes(color = factor(Month)), alpha = 0.4, size = 5) +
  # group = 1 asks for a single fit over all observations.
  geom_smooth(method = "lm", se = FALSE, aes(group = 1))

# The same plot without any colour mapping.
ggplot(airquality, aes(Wind, Temp)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, aes(group = 1))

# Manual colour scale: five "Dark2" colours plus black.
library(RColorBrewer)
myColors <- c(brewer.pal(5, "Dark2"), "black")

# One facet per month, each with its own points and regression line.
ggplot(airquality, aes(Wind, Temp, col = factor(Month))) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, aes(group = 1)) +
  scale_color_manual("Month", values = myColors) +
  facet_grid(. ~ Month) +
  theme_classic()


R语言绘图之颜色


library(RColorBrewer)
# colorRamp() builds a function that takes values in [0, 1] and returns the
# interpolated colours as a matrix of RGB components, one row per value.
pal <- colorRamp(c("red", "blue"))
pal(0)
pal(1)
pal(0.5)
pal(seq(0, 1, length.out = 10))

# colorRampPalette() builds a function that takes the NUMBER of colours wanted
# and returns that many colour strings.
pal <- colorRampPalette(c("red", "blue"))
pal(0)
pal(1)
# NOTE(review): the argument here is a colour count, not a position in
# [0, 1] -- confirm that passing 0.5 is intentional.
pal(0.5)
pal(10)
# Overview table of the palettes that RColorBrewer provides.
brewer.pal.info

# Take three colours from the "Greens" palette and show them.
cols <- brewer.pal(n = 3, name = "Greens")
cols

# Interpolate between those three colours to get a longer palette.
pal <- colorRampPalette(colors = cols)
pal

# Use 20 interpolated colours for the volcano data.
image(volcano, col = pal(20))

# Show the original three-colour palette as swatches.
display.brewer.pal(n = 3, name = "Greens")
图形设备            
         
