###################################
# Transcript R Workshop 29/03/2017
###################################
# A Buerki
#

##### Section 1: Basics
"Welcome!"
# You can run a command from the transcript by clicking on the 'Run' button
# just above this text area. Everything in this transcript that does NOT have
# a '#' at the beginning of the line is a command.

# In the main, we can do 4 types of things in R:
# 1) create objects and delete objects
# 2) display objects and parts of objects
# 3) copy objects and parts of objects
# 4) run functions on objects and parts of objects

## creating objects
# Let's create an object holding some numbers; let's say they are ages.
# In R, this type of object is called a vector. You can think of it as a
# variable. The c() function combines/collates values.
AGE <- c(37, 24, 26, 30, 46)
# You can see that such objects now appear in the top right pane
# (in the 'Environment').

# Let's create a variable with genders. Values that are not numbers must be
# wrapped in quotation marks.
GENDER <- c("m", "f", "m", "f", "f")
# This is really a categorical variable rather than just a string of
# characters, so we change its status to a factor:
GENDER <- factor(GENDER)

# Now, let's create a data frame inside of R.
# 1) Assemble vectors of variables (these become the columns of the data
#    frame). To our AGE and GENDER variables we add a variable with scores:
SCORES <- c(8, 6, 5, 10, 7)
# 2) Make those vectors into a data frame:
our.data <- data.frame(GENDER, AGE, SCORES)

## display objects
# We can look at the data frame in different ways:
our.data
View(our.data)
# ... or we can edit it:
fix(our.data)
# Add another female person, aged 50 with a score of 8, and close.
# We can also look at individual variables in the data frame and their values:
our.data$AGE
our.data$GENDER
# ... or we can pick out individual values:
our.data$AGE[1]                       # first value of AGE in our.data
our.data$AGE[c(1, 4)]                 # first and fourth value of AGE
our.data$AGE[1:3]                     # values 1 to 3 of AGE
our.data$AGE[our.data$GENDER == "m"]  # ages of the male people in our data

## copying objects
# It's not a bad idea to make a backup of our data frame. We just copy it:
backup.of.our.data <- our.data
# ... or we can copy part of it:
more.ages <- our.data$AGE

# To add a new column, we just tell R what data to put where. Here the values
# go into a column called L1.
# NOTE(review): five values are supplied, one per row of the data frame as it
# was first created; if a sixth person was added with fix() earlier, the
# lengths no longer match -- TODO confirm and supply a sixth value in that case.
our.data$L1 <- c("English", "French", "Welsh", "Welsh", "Chinese")

## importing and exporting data
# The other way is to create the data frame in a different programme (for
# example a word processor or a spreadsheet like Excel) and then import it.
# In this case, the file to be imported is best saved as either .txt or .csv.
# For example, we can import a csv file like this:
#   - click on the 'Import Dataset' button (in the upper right pane under
#     'Environment')
#   - type in a URL or click 'Browse' to navigate to a local file.
# Let's import this data set:
#   https://goo.gl/bn2TXp   (name: test_scores)
test_scores <- read.csv("https://goo.gl/bn2TXp")

# To export our data frame, we use the function write.csv(). The first
# argument is the name of the data frame, the second is the name we want the
# file to have. This saves the file in the project directory. To re-import it,
# we click 'Import Dataset' and then choose 'From Text File'.
write.csv(our.data, file = "test_scores.csv", row.names = FALSE)

# We can get a nice description of our data frames using these functions:
summary(our.data)  # shows a summary
# Now you can see that GENDER is a factor-type variable with two levels
# (m and f), so a factor variable is the same as a categorical/nominal
# variable.
# The L1 variable is just characters, so we need to convert it to a factor:
our.data$L1 <- factor(our.data$L1)
# now it's ok:
summary(our.data)
# A summary is more interesting with a larger data set:
summary(test_scores)
# You'll notice that the IDs are coded as numeric, which is misleading, so
# we change them to characters:
test_scores$ID <- as.character(test_scores$ID)
summary(test_scores)


##### Section 2: Descriptive Statistics I

## SUMMARISING
## Central tendency

## the mean
# calculating the average (mean) age
mean(our.data$AGE)
# which is of course the same thing as this:
sum(our.data$AGE) / length(our.data$AGE)

## the median
# calculating the median age
median(our.data$AGE)
# To see if that's correct, we sort the ages and can then count to the middle:
sort(our.data$AGE)
# Since there are 5 values, the middle value is the third value. If there is
# an even number of values, there are two values in the middle and one takes
# the mean of those two values as the median.

## the mode
# A good way to pick out the mode is to make a table and sort it by frequency;
# the rightmost value(s) is/are the mode(s):
sort(table(our.data$SCORES))
# Of course, we can also establish the mode for nominal variables like
# 'm'/'f':
sort(table(our.data$GENDER))
# So the mode of GENDER is the gender that occurs most often ('f' as it
# happens in our data). The mode is NOT the frequency with which it occurs,
# but the value itself, which is 'f' here.

## normal distribution
# In a normal distribution, the mean, median and mode are the same.
# The distribution of ages in our sample is not normal. We can also confirm
# this by mapping out the distribution. This is basically the same as a
# histogram, but the lines are heavily smoothed out.
# We can plot a nice picture of the distribution like this:
plot(density(test_scores$AGE),
     xlab = "ages", yaxt = "n", ylab = "frequency",
     main = "distribution of ages")
# As mentioned, it's basically a smoothed-out picture of a histogram:
hist(test_scores$AGE, breaks = 7, xlab = "Ages",
     main = "Histogram of AGE variable")
# Normal distributions are quite rare in linguistic data, so they are not
# really 'normal' for us.
# There is also a test for normality: it's called the Shapiro-Wilk test. If it
# comes out as significant, the distribution differs significantly from a
# normal distribution, which is bad news if you need your data to be normally
# distributed. If the test comes out non-significant, it is typically
# considered safe to pretend that the data are approximately normally
# distributed.
shapiro.test(test_scores$AGE)
# Now this comes out as not significant, meaning that the distribution of
# values does not significantly differ from a normal distribution.

### Dispersion
## Range
range(our.data$AGE)        # shows lowest and highest values
diff(range(our.data$AGE))  # difference between lowest and highest = range

## Interquartile range
# let's first create some fictitious data
PURE.FICTION <- 1:100
# take a look: this is a vector with the numbers 1 to 100
PURE.FICTION
# Now let's look at the quartiles
# (note the function's name is quaNtile, not quaRtile):
quantile(PURE.FICTION)
# So the interquartile range of PURE.FICTION is about 50 (the middle half of
# the values runs from roughly 25 to roughly 75).
# Let's look at more realistic data:
quantile(test_scores$OVERALL)
# You can now read off the interquartile range or calculate it using R. Both
# quartiles must come from the same kind of quantile calculation (here the
# default), otherwise the two ends of the range are not comparable.
# Ignore the percentage label in the result.
quantile(test_scores$OVERALL)[4] - quantile(test_scores$OVERALL)[2]
# R also has a built-in function that gives the same number without a label:
IQR(test_scores$OVERALL)
## Standard deviation
# very easy, this one:
sd(test_scores$AGE)

## Variance
# also very easy:
var(test_scores$AGE)

## TABLES
# Frequency tables (how often the values of a variable occur) are made like
# this; here we see how many 'f's and 'm's there are in the data:
table(our.data$GENDER)
# Frequency tables make the most sense for categorical/nominal variables and
# possibly ordinal ones, but we could also check how many people of each age
# there are in our data:
table(our.data$AGE)
# if you prefer to see proportions, do this
prop.table(table(our.data$GENDER))

# We can also cross-tabulate easily. This is just a variation on the frequency
# table using two (or more) variables. It gives us a table showing the
# frequencies across L1s and GENDER:
table(our.data$GENDER, our.data$L1)
# Again, this makes the most sense with categorical variables, and we can have
# proportions:
prop.table(table(our.data$GENDER, our.data$L1))


#### Section 3: Figures

## Scatterplot
# For this we will work with the 'Alcohol' data set: it contains scores in an
# L2 speaking test and the amount of alcohol (in sips from a pint of CWRW)
# that each test taker took before the speaking test (naturally I made up
# these data).
Alcohol <- read.csv("https://goo.gl/pwBQnJ")
# Let's see how the scores and the amount of alcohol relate:
plot(Alcohol$ALCO, Alcohol$SCORE)
# We can make the plot look nicer by
# 1) setting the x and y axes properly, i.e. having them start from 1.
#    We do this using 'xlim' for the limits of the x-axis (the limits are 1
#    and 20), and the same for the y-axis using 'ylim':
plot(Alcohol$ALCO, Alcohol$SCORE, xlim = c(1, 20), ylim = c(1, 30))
# This is a quick and dirty way of forcing the x and y axis limits; if R
# prints any messages at this point, the result is still OK, no need to worry.
# 2) adding an overall title (main) and better descriptions of the x and y
#    axes (xlab for the x-axis label and ylab for the y-axis label)
plot(Alcohol$ALCO, Alcohol$SCORE,
     xlim = c(1, 20), ylim = c(1, 30),
     main = "Alcohol and L2 Speaking Tests",
     xlab = "Amount of alcohol", ylab = "L2 speaking test score")
# we could also change the dots like this
plot(Alcohol$ALCO, Alcohol$SCORE,
     xlim = c(1, 20), ylim = c(1, 30),
     main = "Alcohol and L2 Speaking Tests",
     xlab = "Amount of alcohol", ylab = "L2 speaking test score",
     pch = 6)
# Now we could see if there are differences between age groups,
# so we add an age-group variable (one value per test taker):
Alcohol$AGE.GRP <- c(
  "young", "young", "middle-aged", "mature", "middle-aged", "middle-aged",
  "middle-aged", "mature", "mature", "mature", "young", "mature", "mature",
  "middle-aged", "young", "mature", "middle-aged", "middle-aged", "young",
  "mature", "young", "mature", "young"
)
# so we plot just the young first
plot(Alcohol$ALCO[Alcohol$AGE.GRP == "young"],
     Alcohol$SCORE[Alcohol$AGE.GRP == "young"],
     xlim = c(1, 20), ylim = c(1, 30),
     main = "Alcohol and L2 Speaking Tests",
     xlab = "Amount of alcohol", ylab = "L2 speaking test score",
     pch = 6)
# Now we add the middle-aged. points() draws into the plot that already
# exists, so the axis limits are already fixed; xlim/ylim are not passed here
# because points() does not use them and would only warn about them.
points(Alcohol$ALCO[Alcohol$AGE.GRP == "middle-aged"],
       Alcohol$SCORE[Alcohol$AGE.GRP == "middle-aged"],
       pch = 15)
# and the mature
points(Alcohol$ALCO[Alcohol$AGE.GRP == "mature"],
       Alcohol$SCORE[Alcohol$AGE.GRP == "mature"],
       pch = 10)
# and we need a legend
legend("bottomright", legend = c("young", "middle-aged", "mature"),
       pch = c(6, 15, 10))

## Barplot
# a bar chart can be created like this (also on the basis of a frequency
# table)
barplot(table(our.data$GENDER))
# again, we can annotate, and we can change colours by using 'col':
barplot(table(our.data$GENDER),
        xlab = "Gender", ylab = "Frequency", main = "Genders in the Data",
        col = c("grey20", "grey60"))
# Now the y-axis tick marks don't look so nice; we just want them to show
# whole numbers:
barplot(table(our.data$GENDER),
        xlab = "Gender", ylab = "Frequency", main = "Genders in the Data",
        col = c("grey20", "grey60"), yaxp = c(0, 3, 3))
# Now let's make this a bit more sophisticated.
# We can show the frequencies for a cross-table like this:
barplot(table(test_scores$GENDER, test_scores$L1),
        beside = TRUE, legend.text = c("female", "male"))
# If we leave out 'beside = TRUE', it will be a stacked bar plot with the
# frequencies of 'F' and 'M' stacked on top of each other. With
# 'legend.text = TRUE' the legend labels are taken from the row names of the
# very table being plotted, so they always match the bars:
barplot(table(test_scores$GENDER, test_scores$L1), legend.text = TRUE)
# for more options see help
help(barplot)

## Line Graph
# Line graphs are typically for mapping the development of something across
# time. In test_scores we don't have time, but we have ages, so if we order
# the data frame by age we can plot the development of a variable across
# different ages (as an example).
test_scores_ordered_by_age <- test_scores[order(test_scores$AGE), ]
plot(test_scores_ordered_by_age$AGE, test_scores_ordered_by_age$SP,
     type = "l",
     xlab = "ages", ylab = "test scores", main = "Test scores across ages")
# Again, we might want to adjust the y-axis to go from 1 to 9 and show each
# full score in the tick marks:
plot(test_scores_ordered_by_age$AGE, test_scores_ordered_by_age$SP,
     type = "l",
     xlab = "ages", ylab = "test scores", main = "Test scores across ages",
     ylim = c(1, 9), yaxp = c(0, 9, 9))

# Here is another example.
# Let's say we have two people's vocabulary test scores; one test was in
# January, another test in July and one in December.
# The test scores for January were 5.5 (for person 1) and 7.5 (for person 2),
# the test scores for July were 6.5 (for person 1) and 7.0 (for person 2),
# the test scores for December were 7.5 (for person 1) and 8.0 (for person 2).
# Now we draw a line graph showing the scores for each person.
# When we draw the first person's scores, we need to think about what we want
# the y-axis to look like. Let's say we want it to show 1 to 9, because those
# were the possible test scores.
# Now to draw this plot we need the first person's values, which are
# c(5.5, 6.5, 7.5). There are 3 tests, so for the x-axis we use c(1, 2, 3).
# Now we're ready to plot:
#   type = "b"       says we want both dots and lines
#   ylim = c(1, 9)   says we want the y-axis to stretch from 1 to 9
plot(c(1, 2, 3), c(5.5, 6.5, 7.5), type = "b", ylim = c(1, 9))
# You can see that the x-axis looks bad. So we first plot without the x-axis
# and add it later. To do that, we include xaxt = "n"; we also like nicer
# labels, so we include those as well:
plot(c(1, 2, 3), c(5.5, 6.5, 7.5),
     type = "b", ylim = c(1, 9),
     xaxt = "n", xlab = "Test", ylab = "Score")
# now we add the axis
axis(1, at = c(1, 2, 3), labels = c("January", "July", "December"))
# Now we want to add the scores of the second person. Those scores are
# c(7.5, 7.0, 8.0). To add this person's scores to our existing graph we use
# 'lines()' instead of 'plot()', but the rest is the same. Almost the same:
# we want to make the line look different, so we use
#   'pch = 2' to change the point character
#   'lty = 3' to change the line type
#   'lwd = 2' to change the weight of the line
lines(c(1, 2, 3), c(7.5, 7.0, 8.0), type = "b", pch = 2, lty = 3, lwd = 2)
# we can add a legend, too:
legend(x = "right", legend = c("Person 1", "Person 2"), pch = c(1, 2))

# Now let's say we want to plot the mean scores of Person 1 and Person 2.
# Let's put the values for January into a variable JAN, those for July into
# JUL and those for December into DEC. Each variable holds the score of
# person 1 followed by the score of person 2, i.e. the same numbers that were
# plotted per person: c(5.5, 6.5, 7.5) and c(7.5, 7.0, 8.0).
JAN <- c(5.5, 7.5)
JUL <- c(6.5, 7.0)
DEC <- c(7.5, 8.0)
# and we plot by using the means of JAN, JUL and DEC
plot(type = "b", c(1, 2, 3), c(mean(JAN), mean(JUL), mean(DEC)))
# again, we can make things a bit nicer by labelling and adjusting the axes:
plot(type = "b", c(1, 2, 3), c(mean(JAN), mean(JUL), mean(DEC)),
     ylim = c(1, 9), xaxt = "n",
     xlab = "Tests", ylab = "Scores", main = "Means of test scores")
# we add the x-axis labels
axis(1, at = c(1, 2, 3), labels = c("January", "July", "December"))

## Pie Chart
# pie charts, for a categorical variable, can be done like this:
pie(table(our.data$GENDER))
# I recommend using grey tones in academic publications, and we might like
# nicer labels:
pie(table(our.data$GENDER), col = c("white", "grey20"),
    labels = c("female", "male"))
# To add another plot in the same window we can do this
# (that's 2 down, 1 along):
par(mfcol = c(2, 1))
pie(table(our.data$L1), col = c("white", "grey20", "grey60", "grey75"),
    main = "L1s")
pie(table(our.data$GENDER), col = c("white", "grey20"),
    labels = c("female", "male"), main = "genders")
# reset it before we go on
par(mfcol = c(1, 1))

## Boxplot
# And here is a boxplot (to use for one or more interval/ratio variables):
boxplot(test_scores$AGE, ylab = "AGE")
# Now let's say we want to compare the ages of male and female subjects using
# a box plot.
# For this we need the age values for 'M' and 'F' separately.
# We can get them like this:
# we want the AGE, but only for cases where GENDER is "F" ...
test_scores$AGE[test_scores$GENDER == "F"]
# ... and the AGE, but only for cases where GENDER is "M"
test_scores$AGE[test_scores$GENDER == "M"]
# now we can supply those values to boxplot
boxplot(test_scores$AGE[test_scores$GENDER == "F"],
        test_scores$AGE[test_scores$GENDER == "M"])
# an easier way is to use a formula
boxplot(AGE ~ GENDER, data = test_scores)
# and we can label things more neatly like this
boxplot(AGE ~ GENDER, data = test_scores,
        xlab = "GENDER", ylab = "AGE",
        main = "Boxplot of male and female ages")
# if we want, we can add a little '+' where the mean is
text(1:2,
     c(mean(test_scores$AGE[test_scores$GENDER == "F"]),
       mean(test_scores$AGE[test_scores$GENDER == "M"])),
     c("+", "+"))
# To make it easier to tell whether there is a significant difference between
# males and females, we can use notches: if the notches overlap, a significant
# difference is unlikely:
boxplot(AGE ~ GENDER, data = test_scores,
        xlab = "GENDER", ylab = "AGE",
        main = "Boxplot of male and female ages", notch = TRUE)

# There are more complicated things we can do with box plots. We could look at
# how AGE is different by GENDER and L1:
boxplot(AGE ~ GENDER + L1, data = test_scores, notch = TRUE)
# F.E there means females with a European language as their L1, M.N is males
# with a non-European language as their L1.

# We could also overlap these: first we plot the scores of females:
boxplot(VOCAB ~ L1, data = test_scores, col = "gray",
        subset = GENDER == "F", boxwex = 0.9)
# then we add the plots for males and make them a bit narrower and white
boxplot(VOCAB ~ L1, data = test_scores, col = "white",
        subset = GENDER == "M", boxwex = 0.6, add = TRUE)

## Interaction plot
# This type of plot can take two categorical variables and one interval/ratio
# variable.
# It shows how the three variables interact.
# For example:
interaction.plot(test_scores$L1, test_scores$GENDER, test_scores$OVERALL,
                 xlab = "L1", ylab = "Scores",
                 trace.label = "Gender", ylim = c(0, 10))
# So we can see that scores differ by gender, but interestingly, among
# L1-speakers of E (= English) male people have slightly higher scores on
# average than female people. This is reversed for L1 W (= Welsh) speakers,
# and there the difference between genders is more pronounced. All of this is
# shown in a single graph. Quite nice.
# Of course I have no idea if this is true outside of these concocted sample
# data.

## There is lots more to plots in R; look at the references to further reading
## or the web.