# tweet: descriptive statistics, text mining and (optionally) machine
# learning classification for two hand-coded tweet data sets, read from
# 'emiliafinal.csv' (group A) and 'abruzzofinal.csv' (group B) in the
# current working directory.
#
# Arguments:
#   toplot  - which plot to draw as a side effect:
#               'bar'        - side by side barplot of the final categories
#               'crossagree' - crossagreement of the machine learning
#                              classifiers (only meaningful with mlclass=TRUE)
#               'hist'       - histogram of tweets by time for group A
#             Any other value draws no plot.
#   mlclass - if TRUE, do machine learning classification (processor intensive)
#   ptrain  - proportion of group A documents sampled into the training set
#   seed    - seed passed to set.seed() before the train/test split
#
# Returns a named list with the counts, agreement statistics, category
# tables, keyword counts, top terms, machine learning results (each set to
# FALSE when mlclass is not TRUE) and the elapsed execution time.
#
# Depends on comp3(), which is not defined inside this function and must be
# available in the calling environment; it is applied row-wise to the
# R1/R2/R3 columns to produce the final category.
tweet <- function(toplot='bar', mlclass=FALSE, ptrain=.1, seed=3826) {

    #libraries
    library(irr)
    library(class)
    library(tm)
    library(kernlab)

    pt <- proc.time()

    #local helpers

    #apply the shared cleaning steps to a tm corpus
    #NOTE(review): recent tm releases generally expect base functions such as
    #tolower to be wrapped in content_transformer() - confirm against the
    #installed tm version
    clean.corpus <- function(corp) {
        corp <- tm_map(corp, stripWhitespace)
        corp <- tm_map(corp, tolower)
        corp <- tm_map(corp, removeWords, stopwords('italian'))
        tm_map(corp, removePunctuation)
    }

    #the n most frequent terms of a term-document matrix (terms in rows)
    top.terms <- function(tdm.mat, n=15) {
        head(sort(rowSums(tdm.mat), decreasing=TRUE), n)
    }

    #count entries of a character column matching each emergency service keyword
    count.keysenders <- function(senders) {
        patterns <- c('118', 'croce rossa', 'croce viola', 'protezione civile')
        counts <- vapply(
            patterns,
            function(p) sum(grepl(p, senders, ignore.case=TRUE)),
            integer(1)
        )
        names(counts) <- c('118', 'Croce Rossa', 'Croce Viola', 'Protezione Civile')
        counts
    }

    #read data and flag first round agreement between rater 1 and rater 2
    A <- read.csv('emiliafinal.csv')
    B <- read.csv('abruzzofinal.csv')
    A$match1 <- (A$R1 == A$R2)
    B$match1 <- (B$R1 == B$R2)

    A.nomatch <- A[A$match1 == FALSE, ]
    A.mismatches <- nrow(A.nomatch)

    B.nomatch <- B[B$match1 == FALSE, ]
    B.mismatches <- nrow(B.nomatch)

    A.ntweets <- nrow(A)
    B.ntweets <- nrow(B)

    #calculate the final scores from the three ratings
    A.r123 <- A[, c('R1', 'R2', 'R3')]
    A$final <- as.factor(apply(A.r123, 1, FUN=comp3))

    B.r123 <- B[, c('R1', 'R2', 'R3')]
    B$final <- as.factor(apply(B.r123, 1, FUN=comp3))

    #calculate cohen's kappa for the first round matches
    A.raters <- table("Rater 1"=A$R1, "Rater 2"=A$R2)
    A.rateragree <- sum(diag(A.raters / nrow(A)))
    A.r12 <- A[, c('R1', 'R2')]
    A.kappa <- kappa2(A.r12)

    B.raters <- table("Rater 1"=B$R1, "Rater 2"=B$R2)
    B.rateragree <- sum(diag(B.raters / nrow(B)))
    B.r12 <- B[, c('R1', 'R2')]
    B.kappa <- kappa2(B.r12)

    #table the frequencies of categories and compare the two groups
    A.table <- table(A$final)
    B.table <- table(B$final)
    chi.cat <- chisq.test(rbind(A.table, B.table))

    #find number of authors and number of tweets per author
    #NOTE(review): the column used here is named actor__postedTime, which
    #reads like a timestamp rather than an author - confirm the CSV schema
    A.authortable <- table(A$actor__postedTime)
    A.nauthors <- length(A.authortable)
    A.nsent <- summary(as.numeric(A.authortable))
    B.authortable <- table(B$actor__postedTime)
    B.nauthors <- length(B.authortable)
    B.nsent <- summary(as.numeric(B.authortable))

    #retweets: count and percentage of rows flagged in the RT column
    A.nrtweet <- sum(A$RT)
    A.prtweet <- 100 * (A.nrtweet / length(A$RT))
    B.nrtweet <- sum(B$RT)
    B.prtweet <- 100 * (B.nrtweet / length(B$RT))

    #searching for keywords among the senders
    A.keysenders <- count.keysenders(A$actor__postedTime)
    B.keysenders <- count.keysenders(B$actor__postedTime)

    #mentions of '800' or 'numero verde' in the tweet text
    #NOTE(review): group A searches object__id while group B searches text -
    #confirm that object__id really holds the tweet text for group A
    A.n800 <- sum(grepl('800', A$object__id)) +
        sum(grepl('numero verde', A$object__id, ignore.case=TRUE))
    B.n800 <- sum(grepl('800', B$text)) +
        sum(grepl('numero verde', B$text, ignore.case=TRUE))

    #textmining Create Corpus and TDM/DTM
    A.tweetcorp <- Corpus(
        VectorSource(paste(A$object__id, A$actor__postedTime, sep=' ')),
        readerControl=list(language='it')
    )
    A.tweetcorp <- clean.corpus(A.tweetcorp)
    m <- as.matrix(TermDocumentMatrix(A.tweetcorp))
    A.top15 <- top.terms(m)

    B.tweetcorp <- clean.corpus(Corpus(VectorSource(B$text)))
    m2 <- as.matrix(TermDocumentMatrix(B.tweetcorp))
    B.top15 <- top.terms(m2)

    #timeline of tweets
    #NOTE(review): timestamps are parsed from generator__link - confirm schema
    tweettimeformat <- "%Y-%m-%dT%H:%M:%S.000Z"
    A$tweettime <- strptime(A$generator__link, format=tweettimeformat)

    #do machine learning if requested
    if (mlclass == TRUE) {
        ml.done <- TRUE

        #split data to training and test sets (documents in rows)
        mt <- t(m)
        train.size <- floor(ptrain * nrow(mt))
        if (train.size < 1 || train.size >= nrow(mt)) {
            stop("ptrain must leave at least one document in both the ",
                 "training and the test set", call.=FALSE)
        }
        set.seed(seed)
        train.ind <- sample(seq_len(nrow(mt)), size=train.size)
        #drop=FALSE keeps a matrix even when a single row is selected
        A.train <- mt[train.ind, , drop=FALSE]
        A.test <- mt[-train.ind, , drop=FALSE]
        A.traincl <- A$final[train.ind]
        A.testcl <- A$final[-train.ind]
        A.testcorp <- A.tweetcorp[-train.ind]
        A.traintext <- as.character(A.tweetcorp[train.ind]) #transformed text
        A.testtext <- as.character(A.testcorp)              #test corpus

        #text mining for classification: knn
        A.knn <- knn(train=A.train, test=A.test, cl=A.traincl)
        A.knn.table <- table('final'=A.testcl, 'knn'=A.knn)
        #fixed: the original divided by nrow(A.knntest), an undefined object
        A.knn.crossagree <- sum(diag(A.knn.table) / nrow(A.test))

        #text mining for classification: ksvm
        A.ksvm <- ksvm(x=A.train, y=A.traincl, scaled=FALSE)
        A.ksvmpredict <- predict(A.ksvm, newdata=A.test)
        A.ksvm.table <- table('final'=A.testcl, 'ksvm'=A.ksvmpredict)
        A.ksvm.crossagree <- sum(diag(A.ksvm.table) / nrow(A.test))

        #text mining for classification: sksvm (string kernel on the raw text)
        stringkern <- stringdot(type='string')
        A.sksvm <- ksvm(x=A.traintext, y=A.traincl, scaled=FALSE, kernel=stringkern)
        A.sksvmpredict <- predict(A.sksvm, newdata=A.testtext)
        A.sksvm.table <- table('final'=A.testcl, 'String Kernel SV'=A.sksvmpredict)
        A.sksvm.crossagree <- sum(diag(A.sksvm.table) / nrow(A.test))

        #text mining for clustering: specc
        #NOTE(review): cluster labels are arbitrary, so the diagonal of this
        #table is not obviously an agreement measure - confirm the intent
        n.clusters <- 5
        A.speccpredict <- specc(x=as.list(A.testtext), centers=n.clusters, kernel=stringkern)
        A.specc.table <- table('final'=A.testcl, 'specc'=A.speccpredict)
        A.specc.crossagree <- sum(diag(A.specc.table) / length(A.testcl))

        #proportion test for the three machine learnings
        #corrected using holm method
        #Holm S. 1979.  Scand J Stat
        ml.correct <- c(
            sum(diag(A.knn.table)),
            sum(diag(A.ksvm.table)),
            sum(diag(A.sksvm.table))
        )
        ml.trials <- rep(length(A.testcl), times=3)
        ml.proptest <- prop.test(x=ml.correct, n=ml.trials)
        ml.proptest.adjust <- pairwise.prop.test(x=ml.correct, n=ml.trials, p.adjust.method='holm')

        #top 15 for each of the 5 clusters
        #fixed: the clusters were fitted on the test documents only, so the
        #cluster membership must index the test corpus, not the full corpus
        A.specc.top15 <- lapply(seq_len(n.clusters), function(i) {
            cluster.corp <- A.testcorp[A.speccpredict == i]
            top.terms(as.matrix(TermDocumentMatrix(cluster.corp)))
        })

    } else {
        #placeholders so the returned list always has the same names
        ml.done <- FALSE
        A.knn.table <- FALSE
        A.knn.crossagree <- FALSE
        A.ksvm.table <- FALSE
        A.ksvm.crossagree <- FALSE
        A.sksvm.table <- FALSE
        A.sksvm.crossagree <- FALSE
        ml.proptest <- FALSE
        ml.proptest.adjust <- FALSE
        A.specc.table <- FALSE
        A.specc.crossagree <- FALSE
        A.specc.top15 <- FALSE
    }

    #PLOTS PLOTS PLOTS
    if (toplot == 'bar') {
        prop.mat <- 100 * rbind(prop.table(A.table), prop.table(B.table))
        barplot(prop.mat, beside=TRUE, main='Distribution of Tweets by Category',
                xlab='Category', ylab='Percentage of Tweets', ylim=c(0,50),
                col=c(17,24))
        legend(x=7.8, y=49.8, legend=c('Emilia','Abruzzo'), pch=15, col=c(17,24))

    } else if (toplot == 'crossagree') {
        if (!ml.done) {
            warning("toplot='crossagree' requires mlclass=TRUE; ",
                    "plotting placeholder values", call.=FALSE)
        }
        barplot(c('knn'=A.knn.crossagree, 'ksvm'=A.ksvm.crossagree, 'sksvm'=A.sksvm.crossagree),
                main='Crossagreement of Tweets by Machine Learning',
                ylab='Proportion Crossagreement',
                sub='Machine Classification Method', ylim=c(0,1))
        #reference line drawn at a hard-coded 0.81
        abline(0.81, 0, lty=2)
        legend(x=1.5, y=1, lty=2, legend='Reviewer Crossagreement')

    } else if (toplot == 'hist') {
        #the four earliest timestamps are dropped before plotting
        hist(sort(A$tweettime)[-c(1:4)], breaks='mins',
             main='Distribution of Tweets by Time',
             ylab='Number of Tweets per Minute', xlab='Time', freq=TRUE)
    }

    ##end of the plots

    execution.time <- proc.time() - pt

    #returns
    list(
        A.ntweets=A.ntweets,
        B.ntweets=B.ntweets,
        A.raters=A.raters,
        A.rateragree=A.rateragree,
        A.mismatches=A.mismatches,
        A.kappa=A.kappa,
        B.raters=B.raters,
        B.rateragree=B.rateragree,
        B.mismatches=B.mismatches,
        B.kappa=B.kappa,
        A.table=A.table,
        B.table=B.table,
        chi.cat=chi.cat,
        A.nauthors=A.nauthors,
        B.nauthors=B.nauthors,
        A.nsent=A.nsent,
        B.nsent=B.nsent,
        A.nrtweet=A.nrtweet,
        A.prtweet=A.prtweet,
        B.nrtweet=B.nrtweet,
        B.prtweet=B.prtweet,
        A.keysenders=A.keysenders,
        B.keysenders=B.keysenders,
        A.n800=A.n800,
        B.n800=B.n800,
        A.top15=A.top15,
        B.top15=B.top15,
        A.knn.table=A.knn.table,
        A.knn.crossagree=A.knn.crossagree,
        A.ksvm.table=A.ksvm.table,
        A.ksvm.crossagree=A.ksvm.crossagree,
        A.sksvm.table=A.sksvm.table,
        A.sksvm.crossagree=A.sksvm.crossagree,
        ml.proptest=ml.proptest,
        ml.proptest.adjust=ml.proptest.adjust,
        A.specc.table=A.specc.table,
        A.specc.crossagree=A.specc.crossagree,
        A.specc.top15=A.specc.top15,
        execution.time=execution.time
    )
}