01 July 2015
Macro-socio-ecology-variables






0.0.1 UNIQUE VALUES OF SOCIAL VARIABLES

library(ggplot2);library(reshape);library(Hmisc)
wd<-"G:/Documents/PostDocKVA/Labbook/projects/macro-socio-ecology"
QoG.wd<-"G:\\Documents\\PostDocKVA\\Data\\QOG"
WB.poverty.wd<-"G:\\Documents\\PostDocKVA\\Data\\WorldBank\\PovertyEquityDB"
eora.wd<-"G:/Documents/PostDocKVA/Data/Eora" ### eora data directory

setwd(eora.wd)
### Reading in .csv file with annual gdp and population sizes
gdppop.df<-read.csv("gdppop.csv",header=TRUE,skip=1) #skipping the first line which includes a description of the file
### Reading in .csv file with various regional memberships
region.df<-read.csv("regionmembership.csv",header=T,skip=1) #skipping the first line which includes a description of the file
continent.df<-region.df[which(region.df[,"region_type"]=="Continents"),c("region_name","name","a3")]
GDP.df<-region.df[which(region.df[,"region_type"]=="GDP Regions"),c("region_name","name","a3")]

#ConsumTrend.df<-merge(ConsumTrend.df,continent.df,by.x="Country",by.y="name",all.z=TRUE)

year.min<-1992
year.max<-2008

setwd(wd)

var.df<-read.csv("social_variables.csv",header=T,stringsAsFactors = FALSE,na.strings="")

var.df<-var.df[which(is.na(var.df[,"varcode"])==FALSE),]

data.dirs<-unique(var.df$dir)
data.files<-unique(var.df$file)

## reading in QoG data

if(QoG.wd %in% data.dirs){
  
setwd(QoG.wd)  
QoG.basts.df<-read.csv("qog_bas_ts_jan15.csv",header=T,stringsAsFactors = FALSE,na.strings="")  

QoG.sel.vars<-unique(var.df[which(var.df$dataset == "QoG"),"varcode"])


QoG.var.df<-var.df[which(var.df$dataset == "QoG"),]


QoG.sel.df<-QoG.basts.df[,c("ccode","cname","year","ccodealp","cname_year","ccodealp_year","ccodecow","ccodewb","version",QoG.sel.vars)]

#c("ccode","cname","year","ccodealp","cname_year","ccodealp_year","ccodecow","ccodewb","version",QoG.sel.vars)[c("ccode","cname","year","ccodealp","cname_year","ccodealp_year","ccodecow","ccodewb","version",QoG.sel.vars)%in%names(QoG.basts.df)==FALSE]

for(i in 1:length(QoG.sel.vars)){

row.i<-which(QoG.var.df[,"varcode"]  == QoG.sel.vars[i])  

#print(row.i)

if(is.na(row.i) == FALSE){  
QoG.sel.df[,paste(QoG.sel.vars[i])] <- QoG.sel.df[,QoG.sel.vars[i]]*QoG.var.df[row.i,"multiplier"]
}  




QoG.sel.df<-QoG.sel.df[which(QoG.sel.df$year %in% c(year.min:year.max)),]


}

}

### CALCULATING WITHIN COUNTRY STANDARD DEVIATION

country.df<-data.frame("country"=sort(unique(QoG.sel.df$ccodealp)))
#country.df[,paste(QoG.sel.vars,"mean",sep=".")]<-NA
#country.df[,paste(QoG.sel.vars,"median",sep=".")]<-NA


QoG.sel.df<-QoG.sel.df[order(QoG.sel.df$ccodealp),]

QoG.sd<-as.data.frame(apply(QoG.sel.df[,QoG.sel.vars],2,function(x,y=QoG.sel.df$ccodealp) tapply(x,y,function(z) abs(sd(z,na.rm=TRUE)/mean(z,na.rm=TRUE)))))

#names(QoG.sd)<-paste(names(QoG.sd),"sd",sep=".")

country.df<-as.data.frame(cbind(country.df,QoG.sd))
#country.df[,paste(QoG.sel.vars,"sd",sep=".")]<-QoG.sd

country.melt.sd.df<-melt(country.df[,c("country",names(QoG.sd))])

#country.melt.sd.df[,"variable"]<-gsub(".sd","")
country.melt.sd.df<-merge(country.melt.sd.df,var.df,by.x="variable",by.y="varcode",all.x=TRUE)

#x11()
ggplot(country.melt.sd.df[-which(country.melt.sd.df[,"variable"]%in%c("cam_inclusive","wbgi_cce")),],aes(x=variable,y=value))+geom_violin()+facet_wrap(~social,scales="free",ncol=2)+ylab("CV")

####

### CALCULATING WITHIN COUNTRY UNIQUE VALUES

country.df<-data.frame("country"=sort(unique(QoG.sel.df$ccodealp)))
#country.df[,paste(QoG.sel.vars,"mean",sep=".")]<-NA
#country.df[,paste(QoG.sel.vars,"median",sep=".")]<-NA


QoG.sel.df<-QoG.sel.df[order(QoG.sel.df$ccodealp),]

QoG.unique<-as.data.frame(apply(QoG.sel.df[,QoG.sel.vars],2,function(x,y=QoG.sel.df$ccodealp) tapply(x,y,function(z) length(which(is.na(unique(z,na.rm=TRUE))==FALSE)))))

#names(QoG.sd)<-paste(names(QoG.sd),"sd",sep=".")

country.df<-as.data.frame(cbind(country.df,QoG.unique))
#country.df[,paste(QoG.sel.vars,"sd",sep=".")]<-QoG.sd

country.melt.unique.df<-melt(country.df[,c("country",names(QoG.unique))])

#country.melt.sd.df[,"variable"]<-gsub(".sd","")
country.melt.unique.df<-merge(country.melt.unique.df,var.df,by.x="variable",by.y="varcode",all.x=TRUE)

#x11()
ggplot(country.melt.unique.df[-which(country.melt.unique.df[,"variable"]%in%c("cam_inclusive","wbgi_cce")),],aes(x=variable,y=value))+geom_violin()+facet_wrap(~social,scales="free",ncol=2)+ylab("unique values")

### CALCULATING WITHIN COUNTRY UNIQUE VALUES PER TIME SERIES LENGTH

country.df<-data.frame("country"=sort(unique(QoG.sel.df$ccodealp)))

QoG.sel.df<-QoG.sel.df[order(QoG.sel.df$ccodealp),]

#QoG.relunique<-as.data.frame(apply(QoG.sel.df[,QoG.sel.vars],2,function(x,y=QoG.sel.df$ccodealp) tapply(x,y,function(z) length(which(is.na(unique(z,na.rm=TRUE))==FALSE))/
#                                                                                                       length(which(is.na(z)==FALSE)))))

#QoG.relunique<-as.data.frame(apply(QoG.sel.df[,QoG.sel.vars],2,function(x,y=QoG.sel.df$ccodealp) tapply(x,y,function(z,w=length(which(is.na(unique(z))==FALSE))) ifelse(w>0,
#                                                                                                      length(which(is.na(unique(z))==FALSE))/
#                                                                                                       length(which(is.na(z)==FALSE)),NA)
#                                                                                                      )))


QoG.relunique<-as.data.frame(apply(QoG.sel.df[,QoG.sel.vars],2,function(x,y=QoG.sel.df$ccodealp) tapply(x,y,function(z)
  length(which(is.na(z)==FALSE))
                                                                                                      )))

#QoG.relunique<-as.data.frame((as.matrix(QoG.unique)-1)/as.matrix(QoG.relunique))
QoG.relunique<-as.data.frame(as.matrix(QoG.unique)/as.matrix(QoG.relunique))



#names(QoG.sd)<-paste(names(QoG.sd),"sd",sep=".")

country.df<-as.data.frame(cbind(country.df,QoG.relunique))
#country.df[,paste(QoG.sel.vars,"sd",sep=".")]<-QoG.sd

country.melt.relunique.df<-melt(country.df[,c("country",names(QoG.relunique))])

#country.melt.sd.df[,"variable"]<-gsub(".sd","")
country.melt.relunique.df<-merge(country.melt.relunique.df,var.df,by.x="variable",by.y="varcode",all.x=TRUE)


#x11()
ggplot(country.melt.relunique.df[-which(country.melt.relunique.df[,"variable"]%in%c("cam_inclusive","wbgi_cce")),],
       aes(x=paste(social,variable),y=value))+geom_boxplot()+ylab("unique value ratio")+coord_flip()#+facet_wrap(~social,ncol=2)






0.0.2 MEDIAN VALUES IN FIRST AND SECOND TIME PERIOD

QoG.sel.df[,"period"]<-NA
QoG.sel.df[,"period"]<-ifelse(QoG.sel.df[,"year"]<2000,"a",ifelse(QoG.sel.df[,"year"]>1999,"b",NA))


QoG.sel.df<-QoG.sel.df[order(QoG.sel.df$ccodealp),]

## MEDIAN FIRST PERIOD
QoG.sd.first<-as.data.frame(apply(QoG.sel.df[which(QoG.sel.df$year %in% c(1992:1999)),QoG.sel.vars],2,function(x,y=QoG.sel.df[which(QoG.sel.df$year %in% c(1992:1999)),"ccodealp"]) tapply(x,y,function(z) median(z,na.rm=TRUE))))

QoG.sd.first[,c("ccodealp","year")]<-cbind(rownames(QoG.sd.first),rep(1990,nrow(QoG.sd.first)))
QoG.sd.first.melt<-melt(QoG.sd.first,id.var=c("ccodealp","year"))
names(QoG.sd.first.melt)<-c("ccodealp","first.period","variable","first.value")

## MEDIAN SECOND PERIOD
QoG.sd.second<-as.data.frame(apply(QoG.sel.df[which(QoG.sel.df$year %in% c(2000:2008)),QoG.sel.vars],2,function(x,y=QoG.sel.df[which(QoG.sel.df$year %in% c(2000:2008)),"ccodealp"]) tapply(x,y,function(z) median(z,na.rm=TRUE))))

QoG.sd.second[,c("ccodealp","year")]<-cbind(rownames(QoG.sd.second),rep(2000,nrow(QoG.sd.second)))
QoG.sd.second.melt<-melt(QoG.sd.second,id.var=c("ccodealp","year"))
names(QoG.sd.second.melt)<-c("ccodealp","second.period","variable","second.value")

## MANN WHITNEY TEST
MUtest.list<-by(QoG.sel.df,QoG.sel.df[,"ccodealp"],function(x) apply(x[,QoG.sel.vars],2,function(y,z=x[,"period"],country=x[1,"ccodealp"],
                                                                                    w1=length(which(is.na(y[which(x$year %in% c(1992:1999))])==FALSE)),
                                                                                    w2=length(which(is.na(y[which(x$year %in% c(2000:2008))])==FALSE))) #print(c(w1,w2))
                                                        if(w1 > 2 & w2 > 2){c(unlist(kruskal.test(y, factor(z)))[1:3],country)} else {c(rep(NA,3),country)}#pairwise.wilcox.test
                                                        ))

MUtest.df<-as.data.frame(t(do.call(cbind,lapply(MUtest.list,function(x) rbind(names(as.data.frame(x)),x)
        ))))

names(MUtest.df)<-c("varcode","KW.chisq","KW.dfres","KW.pval","ccodealp")


###############################



## MERGING MEDIANS OF FIRST AND SECOND PERIOD AND CALCULATING DIFFERENCE IN MEDIAN VALUE
QoG.sd.melt<-merge(QoG.sd.first.melt,QoG.sd.second.melt,by=c("ccodealp","variable"),all=TRUE)
QoG.sd.melt[,"diff.value"]<-QoG.sd.melt[,"second.value"]-QoG.sd.melt[,"first.value"]

## MERGING RESULTS FROM KRUSKAL WALLIS TESTS
QoG.sd.melt<-merge(QoG.sd.melt,MUtest.df,by.x=c("ccodealp","variable"),by.y=c("ccodealp","varcode"),all.x=TRUE)



### ADDING INFORMATION ABOUT SOCIAL VARIABLES
QoG.sd.melt<-merge(QoG.sd.melt,var.df,by.x="variable",by.y="varcode",all.x=TRUE)

### ADDING GEOGROAPHICAL AND ECONOMIC GROUPINGS
QoG.sd.melt<-merge(QoG.sd.melt,QoG.sel.df[,c("ccode","cname","ccodealp")],by="ccodealp",all.x=TRUE)

QoG.sd.melt<-merge(QoG.sd.melt,continent.df[,c("a3","region_name")],by.x="ccodealp",by.y="a3",all.x=TRUE)
names(QoG.sd.melt)[grep("region_name",names(QoG.sd.melt))]<-"continent"

QoG.sd.melt<-merge(QoG.sd.melt,GDP.df[,c("a3","region_name")],by.x="ccodealp",by.y="a3",all.x=TRUE)
names(QoG.sd.melt)[grep("region_name",names(QoG.sd.melt))]<-"income.region"


QoG.sd.melt[,"continent2"]<-strtrim(QoG.sd.melt[,"continent"], 2)
QoG.sd.melt[,"income.region2"]<-strtrim(QoG.sd.melt[,"income.region"], 2)
QoG.sd.melt[,"income.region4"]<-strtrim(QoG.sd.melt[,"income.region"], 4)

QoG.sd.melt.plot<-QoG.sd.melt[which(is.na(QoG.sd.melt[,"continent"])==FALSE & is.na(QoG.sd.melt[,"income.region"])==FALSE),]
# all toghether
#x11()

ggplot(QoG.sd.melt.plot[-which(QoG.sd.melt.plot[,"variable"]%in%c("cam_inclusive","wbgi_cce")),],aes(x=variable,y=diff.value))+geom_boxplot()+facet_wrap(~social,scales="free",ncol=2)+ylab("change in median 2000's - 1990's")