# tweet: descriptive statistics, text mining and (optionally) machine-learning
# classification for two sets of hand-coded tweets.
#
# Reads 'emiliafinal.csv' (data set A, labelled 'Emilia' in the plots) and
# 'abruzzofinal.csv' (data set B, labelled 'Abruzzo') from the working
# directory.
#
# Arguments:
#   toplot  Which plot to draw:
#             'bar'        - side by side barplot of categories
#             'crossagree' - crossagreement between the machine learning
#                            methods (only meaningful with mlclass = TRUE)
#             'hist'       - histogram of tweets by time for group A
#           Any other value draws nothing.
#   mlclass If TRUE, do machine learning classification (processor
#           intensive). If FALSE, every ML entry of the result is FALSE.
#   ptrain  Proportion of the documents of A used as the training set.
#   seed    Random seed used for the train/test split.
#
# Returns a named list with the rater agreement statistics, category tables,
# author / retweet / keyword counts, top-15 terms, the machine learning
# results and the elapsed execution time.
#
# Depends on comp3(), which is not defined in this function; it is applied to
# each row of the R1/R2/R3 rating columns to obtain the final category.
#
# NOTE(review): data set A takes its tweet text from column `object__id`, its
# senders from `actor__postedTime` and its timestamps from `generator__link`,
# whereas B takes its text from `text` - confirm these column names against
# the CSV headers.
tweet <- function(toplot = "bar", mlclass = FALSE, ptrain = .1, seed = 3826) {
  # libraries
  library(irr)
  library(class)
  library(tm)
  library(kernlab)

  pt <- proc.time()

  # Count the senders matching each keyword of interest (case-insensitive).
  count.keysenders <- function(senders) {
    patterns <- c(
      "118" = "118",
      "Croce Rossa" = "croce rossa",
      "Croce Viola" = "croce viola",
      "Protezione Civile" = "protezione civile"
    )
    vapply(
      patterns,
      function(p) sum(grepl(p, senders, ignore.case = TRUE)),
      integer(1)
    )
  }

  # Apply the shared cleaning pipeline to a tm corpus.
  clean.corpus <- function(corp) {
    corp <- tm_map(corp, stripWhitespace)
    corp <- tm_map(corp, tolower)
    corp <- tm_map(corp, removeWords, stopwords("italian"))
    tm_map(corp, removePunctuation)
  }

  # Term frequencies over a term-document matrix, most frequent first.
  top.terms <- function(tdm.matrix, n = 15) {
    head(sort(rowSums(tdm.matrix), decreasing = TRUE), n)
  }

  A <- read.csv("emiliafinal.csv")
  B <- read.csv("abruzzofinal.csv")

  # first round agreement between rater 1 and rater 2
  A$match1 <- (A$R1 == A$R2)
  B$match1 <- (B$R1 == B$R2)
  A.nomatch <- A[A$match1 == FALSE, ]
  A.mismatches <- dim(A.nomatch)[1]
  B.nomatch <- B[B$match1 == FALSE, ]
  B.mismatches <- dim(B.nomatch)[1]
  A.ntweets <- dim(A)[1]
  B.ntweets <- dim(B)[1]

  # calculate the final scores
  A.r123 <- A[, c("R1", "R2", "R3")]
  A$final <- as.factor(apply(A.r123, 1, FUN = comp3))
  B.r123 <- B[, c("R1", "R2", "R3")]
  B$final <- as.factor(apply(B.r123, 1, FUN = comp3))

  # calculate cohen's kappa for the first round matches
  A.raters <- table("Rater 1" = A$R1, "Rater 2" = A$R2)
  A.rateragree <- sum(diag((A.raters) / nrow(A)))
  A.r12 <- A[, c("R1", "R2")]
  A.kappa <- kappa2(A.r12)
  B.raters <- table("Rater 1" = B$R1, "Rater 2" = B$R2)
  B.rateragree <- sum(diag((B.raters) / nrow(B)))
  B.r12 <- B[, c("R1", "R2")]
  B.kappa <- kappa2(B.r12)

  # table the frequencies of categories
  A.table <- table(A$final)
  B.table <- table(B$final)
  chi.cat <- chisq.test(rbind(A.table, B.table))

  # find number of authors and number of tweets per author
  A.authortable <- table(A$actor__postedTime)
  A.nauthors <- length(A.authortable)
  A.nsent <- summary(as.numeric(A.authortable))
  B.authortable <- table(B$actor__postedTime)
  B.nauthors <- length(B.authortable)
  B.nsent <- summary(as.numeric(B.authortable))

  # retweets
  A.nrtweet <- sum(A$RT)
  A.prtweet <- 100 * (sum(A$RT) / length(A$RT))
  B.nrtweet <- sum(B$RT)
  B.prtweet <- 100 * (sum(B$RT) / length(B$RT))

  # searching for keywords
  A.keysenders <- count.keysenders(A$actor__postedTime)
  B.keysenders <- count.keysenders(B$actor__postedTime)
  A.n800 <- sum(grepl("800", A$object__id)) +
    sum(grepl("numero verde", A$object__id, ignore.case = TRUE))
  B.n800 <- sum(grepl("800", B$text)) +
    sum(grepl("numero verde", B$text, ignore.case = TRUE))

  # textmining: create corpus and TDM/DTM
  A.tweetcorp <- Corpus(
    VectorSource(paste(A$object__id, A$actor__postedTime, sep = " ")),
    readerControl = list(language = "it")
  )
  A.tweetcorp <- clean.corpus(A.tweetcorp)
  m <- as.matrix(TermDocumentMatrix(A.tweetcorp))
  A.top15 <- top.terms(m)

  B.tweetcorp <- Corpus(VectorSource(B$text))
  B.tweetcorp <- clean.corpus(B.tweetcorp)
  m2 <- as.matrix(TermDocumentMatrix(B.tweetcorp))
  B.top15 <- top.terms(m2)

  # timeline of tweets
  tweettimeformat <- "%Y-%m-%dT%H:%M:%S.000Z"
  A$tweettime <- strptime(A$generator__link, format = tweettimeformat)

  # do machine learning if requested
  if (mlclass == TRUE) {
    # split data to training and test sets (documents in rows)
    mt <- t(m)
    train.size <- floor(ptrain * nrow(mt))
    set.seed(seed)
    train.ind <- sample(seq_len(nrow(mt)), size = train.size)
    # drop = FALSE keeps a matrix even when a split has a single document
    A.train <- mt[train.ind, , drop = FALSE]
    A.test <- mt[-train.ind, , drop = FALSE]
    A.traincl <- A$final[train.ind]
    A.testcl <- A$final[-train.ind]
    A.testcorp <- A.tweetcorp[-train.ind] # test corpus
    A.traintext <- as.character(A.tweetcorp[train.ind]) # transformed text
    A.testtext <- as.character(A.testcorp)

    # text mining for classification: knn
    A.knn <- knn(train = A.train, test = A.test, cl = A.traincl)
    A.knn.table <- table("final" = A.testcl, "knn" = A.knn)
    # the denominator is the number of test documents (the original referred
    # to an undefined object here)
    A.knn.crossagree <- sum(diag(A.knn.table) / nrow(A.test))

    # text mining for classification: ksvm
    A.ksvm <- ksvm(x = A.train, y = A.traincl, scaled = FALSE)
    A.ksvmpredict <- predict(A.ksvm, newdata = A.test)
    A.ksvm.table <- table("final" = A.testcl, "ksvm" = A.ksvmpredict)
    A.ksvm.crossagree <- sum(diag(A.ksvm.table) / nrow(A.test))

    # text mining for classification: sksvm (string kernel)
    stringkern <- stringdot(type = "string")
    A.sksvm <- ksvm(
      x = A.traintext, y = A.traincl, scaled = FALSE, kernel = stringkern
    )
    A.sksvmpredict <- predict(A.sksvm, newdata = A.testtext)
    A.sksvm.table <- table(
      "final" = A.testcl, "String Kernel SV" = A.sksvmpredict
    )
    A.sksvm.crossagree <- sum(diag(A.sksvm.table) / nrow(A.test))

    # text mining for clustering: specc
    n.clusters <- 5
    A.speccpredict <- specc(
      x = as.list(A.testtext), centers = n.clusters, kernel = stringkern
    )
    A.specc.table <- table("final" = A.testcl, "specc" = A.speccpredict)
    A.specc.crossagree <- sum(diag(A.specc.table) / length(A.testcl))

    # proportion test for the three machine learnings
    # corrected using holm method
    # Holm S. 1979. Scand J Stat
    ml.correct <- c(
      sum(diag(A.knn.table)),
      sum(diag(A.ksvm.table)),
      sum(diag(A.sksvm.table))
    )
    ml.total <- rep(length(A.testcl), times = 3)
    ml.proptest <- prop.test(x = ml.correct, n = ml.total)
    ml.proptest.adjust <- pairwise.prop.test(
      x = ml.correct, n = ml.total, p.adjust.method = "holm"
    )

    # top 15 terms for each of the clusters.
    # The cluster labels refer to the test documents only, so they must index
    # the test corpus; indexing the full corpus would recycle the shorter
    # logical vector and pick unrelated documents.
    A.specc.top15 <- lapply(seq_len(n.clusters), function(i) {
      cluster.docs <- A.testcorp[A.speccpredict == i]
      top.terms(as.matrix(TermDocumentMatrix(cluster.docs)))
    })
  } else {
    # placeholders so the returned list always has the same names
    A.knn.table <- FALSE
    A.knn.crossagree <- FALSE
    A.ksvm.table <- FALSE
    A.ksvm.crossagree <- FALSE
    A.sksvm.table <- FALSE
    A.sksvm.crossagree <- FALSE
    ml.proptest <- FALSE
    ml.proptest.adjust <- FALSE
    A.specc.table <- FALSE
    A.specc.crossagree <- FALSE
    A.specc.top15 <- FALSE
  }

  # PLOTS PLOTS PLOTS
  if (toplot == "bar") {
    prop.mat <- 100 * rbind(prop.table(A.table), prop.table(B.table))
    barplot(
      prop.mat,
      beside = TRUE,
      main = "Distribution of Tweets by Category",
      xlab = "Category",
      ylab = "Percentage of Tweets",
      ylim = c(0, 50),
      col = c(17, 24)
    )
    legend(
      x = 7.8, y = 49.8,
      legend = c("Emilia", "Abruzzo"),
      pch = 15,
      col = c(17, 24)
    )
  } else if (toplot == "crossagree") {
    barplot(
      c(
        "knn" = A.knn.crossagree,
        "ksvm" = A.ksvm.crossagree,
        "sksvm" = A.sksvm.crossagree
      ),
      main = "Crossagreement of Tweets by Machine Learning",
      ylab = "Proportion Crossagreement",
      sub = "Machine Classification Method",
      ylim = c(0, 1)
    )
    # horizontal reference line at a hard-coded 0.81
    # NOTE(review): presumably the human rater agreement - confirm
    abline(0.81, 0, lty = 2)
    legend(x = 1.5, y = 1, lty = 2, legend = "Reviewer Crossagreement")
  } else if (toplot == "hist") {
    # the four earliest timestamps are dropped before plotting
    hist(
      sort(A$tweettime)[-c(1:4)],
      breaks = "mins",
      main = "Distribution of Tweets by Time",
      ylab = "Number of Tweets per Minute",
      xlab = "Time",
      freq = TRUE
    )
  }
  ## end of the plots

  execution.time <- proc.time() - pt

  # returns
  list(
    A.ntweets = A.ntweets,
    B.ntweets = B.ntweets,
    A.raters = A.raters,
    A.rateragree = A.rateragree,
    A.mismatches = A.mismatches,
    A.kappa = A.kappa,
    B.raters = B.raters,
    B.rateragree = B.rateragree,
    B.mismatches = B.mismatches,
    B.kappa = B.kappa,
    A.table = A.table,
    B.table = B.table,
    chi.cat = chi.cat,
    A.nauthors = A.nauthors,
    B.nauthors = B.nauthors,
    A.nsent = A.nsent,
    B.nsent = B.nsent,
    A.nrtweet = A.nrtweet,
    A.prtweet = A.prtweet,
    B.nrtweet = B.nrtweet,
    B.prtweet = B.prtweet,
    A.keysenders = A.keysenders,
    B.keysenders = B.keysenders,
    A.n800 = A.n800,
    B.n800 = B.n800,
    A.top15 = A.top15,
    B.top15 = B.top15,
    A.knn.table = A.knn.table,
    A.knn.crossagree = A.knn.crossagree,
    A.ksvm.table = A.ksvm.table,
    A.ksvm.crossagree = A.ksvm.crossagree,
    A.sksvm.table = A.sksvm.table,
    A.sksvm.crossagree = A.sksvm.crossagree,
    ml.proptest = ml.proptest,
    ml.proptest.adjust = ml.proptest.adjust,
    A.specc.table = A.specc.table,
    A.specc.crossagree = A.specc.crossagree,
    A.specc.top15 = A.specc.top15,
    execution.time = execution.time
  )
}