###################################
# Transcript R Workshop 29/03/2017 
###################################
# A Buerki
#
##### Section 1: Basics
"Welcome!"
# To run a command from this transcript, click the 'Run' button just above
# this text area. Every line that does NOT start with a '#' is a command;
# lines that do start with '#' are comments for the reader.

# Broadly speaking, there are 4 kinds of things we do in R:
# 1) create and delete objects
# 2) display objects and parts of objects
# 3) copy objects and parts of objects
# 4) run functions on objects and parts of objects

## creating objects
# We start with an object holding some numbers -- let's say they are ages.
# R calls this kind of object a vector; you can think of it as a variable.
# The c() function combines/collates the individual values into one vector.
AGE <- c(37, 24, 26, 30, 46)
# Once created, an object appears in the top right pane (the 'Environment').

# Next, a variable with genders. Values that are not numbers must be written
# in quotation marks.
GENDER <- c("m", "f", "m", "f", "f")
# Gender is really a categorical variable rather than just a string of
# characters, so we change its status to a factor like this:
GENDER <- factor(GENDER)

# Now, let's create a data frame inside of R.
# 1) Assemble the vectors of variables (these become the columns of the data
#    frame). Alongside AGE and GENDER we want a variable containing scores:
SCORES <- c(8, 6, 5, 10, 7)
# 2) Combine those vectors into a data frame:
our.data <- data.frame(GENDER, AGE, SCORES)

## display objects
# There are several ways to look at the data frame. We can print it ...
our.data
# ... or open it in a viewer.
View(our.data)
# With fix() we can also edit it:
fix(our.data)
# In the editor, add another female person, aged 50 with a score of 8, and
# close the editor.
# Individual variables of the data frame (and their values) are reached
# with '$':
our.data$AGE
our.data$GENDER
# Square brackets pick out particular values:
our.data$AGE[1]        # the first value of the AGE variable in our.data
our.data$AGE[c(1, 4)]  # the first and fourth value of AGE
our.data$AGE[1:3]      # values 1 to 3 of the AGE variable
our.data$AGE[our.data$GENDER == "m"]  # ages of the male people in our data

## copying objects
# It's not a bad idea to make a backup of our data frame; we just copy it.
backup.of.our.data <- our.data
# We can also copy just a part of it, for example a single column:
more.ages <- our.data$AGE
# To add a new column, we tell R what data to put where. A new column needs a
# value for every row: five values do not fit into six rows, and R would stop
# with an error. Because a sixth person may have been added with fix() above,
# we index the values by row number. The five original rows get their L1 and
# any additional row gets NA (missing) until you fill it in.
l1_values <- c("English", "French", "Welsh", "Welsh", "Chinese")
our.data$L1 <- l1_values[seq_len(nrow(our.data))]  # the new column is called L1

## importing and exporting data
# The other way to get a data frame is to create it in a different programme
# (for example a word processor or a spreadsheet like Excel) and then import
# it. In that case, the file to be imported is best saved as either .txt
# or .csv.
# To import a csv file via the menus: click on the 'Import Dataset' button
# (in the upper right pane under 'Environment'), then type in a URL or click
# 'Browse' to navigate to a local file.

# Let's import this data set:
# https://goo.gl/bn2TXp (name: test_scores)
test_scores <- read.csv("https://goo.gl/bn2TXp")

# To export our data frame, we use the function write.csv(). The first
# argument is the name of the data frame, the second ('file') is the name we
# want the file to have, and row.names = FALSE leaves out the row names.
write.csv(our.data, file = "test_scores.csv", row.names = FALSE)
# The file name is a relative one, so the file is saved in the current working
# directory (in an RStudio project, that is the project directory). To
# re-import it, we click 'Import Dataset' and then choose 'From Text File'.

# The summary() function gives a nice description of a data frame:
summary(our.data)
# Here you can see that GENDER is a factor-type variable with two levels
# (m and f); a factor variable is the same as a categorical/nominal variable.
# The L1 variable is just characters, so we need to convert it to a factor:
our.data$L1 <- factor(our.data$L1)
# now it's ok:
summary(our.data)
# A summary is more interesting with a larger data set:
summary(test_scores)
# You'll notice that the IDs are coded as numeric, which is misleading, so
# we change them to characters:
test_scores$ID <- as.character(test_scores$ID)
summary(test_scores)


##### Section 2: Descriptive Statistics I

## SUMMARISING

## Central tendency
## the mean
# the average (mean) age:
mean(our.data$AGE)
# which is, of course, the sum of the ages divided by how many ages there are:
sum(our.data$AGE) / length(our.data$AGE)

## the median
# the median age:
median(our.data$AGE)
# To see if that's correct, we sort the ages and count to the halfway point:
sort(our.data$AGE)
# Since there are 5 values, the middle value is the third value. With an even
# number of values there are two middle values, and one would take the mean
# of those two values as the median.

## the mode
# A good way to pick out the mode is to make a frequency table and sort it
# by frequency:
sort(table(our.data$SCORES))  # the rightmost value(s) is/are the mode(s)
# The mode can also be established for nominal variables like 'm'/'f':
sort(table(our.data$GENDER))
# So the mode of GENDER is the gender that occurs most often ('f', as it
# happens, in our data). The mode is NOT the frequency with which that value
# occurs, but the value itself, which is 'f' here.

## normal distribution
# In a normal distribution, the mean, median and mode are the same.
# The distribution of ages in our sample is not normal. We can also confirm
# this by mapping out the distribution with a density plot. This is basically
# the same as a histogram, but the lines are heavily smoothed out.
# A nice picture of the distribution is plotted like this:
plot(
  density(test_scores$AGE),
  main = "distribution of ages",
  xlab = "ages",
  ylab = "frequency",
  yaxt = "n"
)
# As mentioned, that is basically a smoothed-out picture of this histogram:
hist(
  test_scores$AGE,
  breaks = 7,
  main = "Histogram of AGE variable",
  xlab = "Ages"
)
# Normal distributions are quite rare in linguistic data, so they are not
# really 'normal' for us.
# There is also a test for normality, called the Shapiro-Wilk test. If it
# comes out as significant, the distribution differs significantly from a
# normal distribution, which is bad news if you need your data to be normally
# distributed. If the test comes out non-significant, it is typically
# considered safe to pretend that the data are approximately normally
# distributed.
shapiro.test(test_scores$AGE)
# This comes out as not significant, meaning that the distribution of values
# does not differ significantly from a normal distribution.

### Dispersion
## Range
range(our.data$AGE)        # shows the lowest and the highest value
diff(range(our.data$AGE))  # highest minus lowest value = the range

## Interquartile range
# Let's first create some fictitious data: a vector with the numbers 1 to 100.
PURE.FICTION <- 1:100
# take a look
PURE.FICTION
# Now let's look at the quartiles
# (note that the function's name is quaNtile, not quaRtile):
quantile(PURE.FICTION)
# The interquartile range is the distance from the 25% value to the 75%
# value. For PURE.FICTION, R reports 25.75 and 75.25, so the interquartile
# range is 49.5.
# Let's look at more realistic data:
quantile(test_scores$OVERALL)
# The interquartile range is the 75% value minus the 25% value. You can read
# it off the output above, or let R calculate it with IQR(). IQR() computes
# both quartiles with the same default method as quantile(), so the result
# agrees with the output above, and it is returned as a plain number.
IQR(test_scores$OVERALL)

## Standard deviation
# very easy, this one:
sd(test_scores$AGE)

## Variance
# also very easy:
var(test_scores$AGE)

## TABLES
# A frequency table shows how often each value of a variable occurs:
table(our.data$GENDER)  # how many 'f's and 'm's there are in the data
# Frequency tables make the most sense for categorical/nominal variables and
# possibly ordinal ones, but we could also check how many people of each age
# there are in our data:
table(our.data$AGE)  # how many of each age
# If you prefer to see proportions, wrap the table in prop.table():
prop.table(table(our.data$GENDER))
# We can also cross-tabulate easily. This is just a variation on the
# frequency table that uses two (or more) variables:
table(our.data$GENDER, our.data$L1)
# This gives us a table showing the frequencies across L1s and GENDER.
# Again, this makes the most sense with categorical variables, and we can
# have proportions here, too:
prop.table(table(our.data$GENDER, our.data$L1))


#### Section 3: Figures

## Scatterplot
# For this we will work with the 'Alcohol' data set: it contains scores in an
# L2 speaking test and the amount of alcohol (in sips from a pint of CWRW)
# that each test taker took before the speaking test (naturally I made up
# these data).
Alcohol <- read.csv("https://goo.gl/pwBQnJ")
# Let's see how the scores and the amount of alcohol relate:
plot(Alcohol$ALCO, Alcohol$SCORE)
# We can make the plot look nicer by
# 1) setting the x and y axes properly, i.e. having them start from 1.
#    We do this using 'xlim' for the limits of the x-axis (the limits are
#    1 and 20), and the same for the y-axis using 'ylim' (1 and 30):
plot(Alcohol$ALCO, Alcohol$SCORE, xlim = c(1, 20), ylim = c(1, 30))
# 2) adding an overall title (main) and better descriptions of the x and y
#    axes (xlab for the x-axis label and ylab for the y-axis label):
plot(
  Alcohol$ALCO, Alcohol$SCORE,
  xlim = c(1, 20), ylim = c(1, 30),
  main = "Alcohol and L2 Speaking Tests",
  xlab = "Amount of alcohol", ylab = "L2 speaking test score"
)
# We could also change the dots, using 'pch' (the point character):
plot(
  Alcohol$ALCO, Alcohol$SCORE,
  xlim = c(1, 20), ylim = c(1, 30),
  main = "Alcohol and L2 Speaking Tests",
  xlab = "Amount of alcohol", ylab = "L2 speaking test score",
  pch = 6
)
# Now we could see if there are differences between age groups, so we add an
# age-group variable. The 23 values below are matched to the rows in order,
# so this assumes the data set has 23 rows.
Alcohol$AGE.GRP <- c(
  "young", "young", "middle-aged", "mature", "middle-aged", "middle-aged",
  "middle-aged", "mature", "mature", "mature", "young", "mature",
  "mature", "middle-aged", "young", "mature", "middle-aged", "middle-aged",
  "young", "mature", "young", "mature", "young"
)
# We plot just the young first; this call sets up the axes, title and labels.
plot(
  Alcohol$ALCO[Alcohol$AGE.GRP == "young"],
  Alcohol$SCORE[Alcohol$AGE.GRP == "young"],
  xlim = c(1, 20), ylim = c(1, 30),
  main = "Alcohol and L2 Speaking Tests",
  xlab = "Amount of alcohol", ylab = "L2 speaking test score",
  pch = 6
)
# Now we add the middle-aged. points() draws into the plot that is already
# there, so the axis limits set by plot() still apply. 'xlim' and 'ylim' are
# not given again: points() would only warn that they are not graphical
# parameters.
points(
  Alcohol$ALCO[Alcohol$AGE.GRP == "middle-aged"],
  Alcohol$SCORE[Alcohol$AGE.GRP == "middle-aged"],
  pch = 15
)
# and the mature
points(
  Alcohol$ALCO[Alcohol$AGE.GRP == "mature"],
  Alcohol$SCORE[Alcohol$AGE.GRP == "mature"],
  pch = 10
)
# And we need a legend; the 'pch' values are listed in the same order as the
# labels so that each label gets the symbol used for that group above.
legend(
  "bottomright",
  legend = c("young", "middle-aged", "mature"),
  pch = c(6, 15, 10)
)

## Barplot
# A bar chart can be created like this (also on the basis of a frequency
# table):
barplot(table(our.data$GENDER))
# Again, we can annotate, and we can change colours by using 'col':
barplot(
  table(our.data$GENDER),
  xlab = "Gender", ylab = "Frequency", main = "Genders in the Data",
  col = c("grey20", "grey60")
)
# Now the y-axis tick marks don't look so nice; we just want them at whole
# numbers. 'yaxp = c(0, 3, 3)' asks for tick marks from 0 to 3 with 3
# intervals in between:
barplot(
  table(our.data$GENDER),
  xlab = "Gender", ylab = "Frequency", main = "Genders in the Data",
  col = c("grey20", "grey60"),
  yaxp = c(0, 3, 3)
)


# Now let's make this a bit more sophisticated.
# We can show the frequencies for a cross-table like this
# ('legend.text' supplies one legend label per row of the table):
barplot(
  table(test_scores$GENDER, test_scores$L1),
  beside = TRUE,
  legend.text = c("female", "male")
)
# If we leave out 'beside = TRUE', it will be a stacked bar plot with the
# frequencies of the genders stacked on top of each other.
# 'legend.text = TRUE' labels the legend with the row names of the table that
# is being plotted, so the legend always matches the bars:
barplot(table(test_scores$GENDER, test_scores$L1), legend.text = TRUE)
# for more options see help
help(barplot)

## Line Graph
# Line graphs are typically used for mapping the development of something
# across time. In test_scores we don't have time, but we do have ages, so if
# we order the data frame by age we can plot the development of a variable
# across different ages (as an example).
test_scores_ordered_by_age <- test_scores[order(test_scores$AGE), ]
plot(
  test_scores_ordered_by_age$AGE, test_scores_ordered_by_age$SP,
  type = "l",
  xlab = "ages", ylab = "test scores", main = "Test scores across ages"
)
# Again, we might want to adjust the y-axis to go from 1 to 9 and show each
# full score in the tick marks:
plot(
  test_scores_ordered_by_age$AGE, test_scores_ordered_by_age$SP,
  type = "l",
  xlab = "ages", ylab = "test scores", main = "Test scores across ages",
  ylim = c(1, 9), yaxp = c(0, 9, 9)
)

# Here is another example.
# Let's say we have two people's vocabulary test scores from three tests: one
# test was in January, another in July and one in December.
# the test scores for January were 5.5 (for person 1) and 7.5 (for person 2)
# the test scores for July were 6.5 (for person 1) and 7.0 (for person 2)
# the test scores for December were 7.5 (for person 1) and 8.0 (for person 2)
# Now we draw a line graph showing the scores for each person.
# When we draw the first person's scores, we need to think about what we want
# the y-axis to look like. Let's say we want it to show 1 to 9, because those
# were the possible test scores.
# To draw this plot we need the first person's values, which are
c(5.5, 6.5, 7.5)
# There are 3 tests, so for the x-axis we use
c(1, 2, 3)
# now we're ready to plot
plot(c(1, 2, 3), c(5.5, 6.5, 7.5), type = "b", ylim = c(1, 9))
# type = "b" says we want both dots and lines;
# ylim = c(1, 9) says we want the y-axis to stretch from 1 to 9.
# You can see that the x-axis looks bad. So we first plot without the x-axis
# and add it later. To do that, we include xaxt = "n"; we would also like
# nicer labels, so we include those as well:
plot(
  c(1, 2, 3), c(5.5, 6.5, 7.5),
  type = "b", ylim = c(1, 9),
  xaxt = "n", xlab = "Test", ylab = "Score"
)
# now we add the axis
axis(1, at = c(1, 2, 3), labels = c("January", "July", "December"))
# Now we want to add the scores of the second person. Those scores are
c(7.5, 7.0, 8.0)
# To add this person's scores to our existing graph we use 'lines()' instead
# of 'plot()', but the rest is the same. Almost the same: we want the line to
# look different, so we use
# 'pch = 2' to change the point character
# 'lty = 3' to change the line type
# 'lwd = 2' to change the weight of the line
lines(c(1, 2, 3), c(7.5, 7.0, 8.0), type = "b", pch = 2, lty = 3, lwd = 2)
# we can add a legend, too:
legend(x = "right", legend = c("Person 1", "Person 2"), pch = c(1, 2))

# Now let's say we want to plot the mean scores of Person 1 and Person 2.
# We put the values for January into a variable JAN, those for July into JUL
# and those for December into DEC. Each variable holds person 1's score
# followed by person 2's score for that test, i.e. the same values as plotted
# above.
JAN <- c(5.5, 7.5)
JUL <- c(6.5, 7.0)
DEC <- c(7.5, 8.0)
# and we plot by using the means of JAN, JUL and DEC
plot(c(1, 2, 3), c(mean(JAN), mean(JUL), mean(DEC)), type = "b")
# Again, we can make things a bit nicer by labelling and adjusting the axes:
plot(
  c(1, 2, 3), c(mean(JAN), mean(JUL), mean(DEC)),
  type = "b", ylim = c(1, 9),
  xaxt = "n", xlab = "Tests", ylab = "Scores",
  main = "Means of test scores"
)
# we add the x-axis labels
axis(1, at = c(1, 2, 3), labels = c("January", "July", "December"))

## Pie Chart
# A pie chart for a categorical variable can be done like this:
pie(table(our.data$GENDER))
# I recommend using grey tones in academic publications, and we might like
# nicer labels:
pie(
  table(our.data$GENDER),
  labels = c("female", "male"),
  col = c("white", "grey20")
)
# To put more than one plot in the same window, we set 'mfcol' first
# (that's 2 down, 1 along):
par(mfcol = c(2, 1))
pie(
  table(our.data$L1),
  main = "L1s",
  col = c("white", "grey20", "grey60", "grey75")
)
pie(
  table(our.data$GENDER),
  main = "genders",
  labels = c("female", "male"),
  col = c("white", "grey20")
)
# reset it to one plot per window before we go on
par(mfcol = c(1, 1))

## Boxplot
# And here is a boxplot (to use for one or more interval/ratio variables):
boxplot(test_scores$AGE, ylab = "AGE")
# Now let's say we want to compare the ages of male and female subjects using
# a box plot. For this we need the age values for 'M' and 'F' separately.
# We can get them like this:
test_scores$AGE[test_scores$GENDER == "F"]  # AGE, but only where GENDER is "F"
test_scores$AGE[test_scores$GENDER == "M"]  # AGE, but only where GENDER is "M"
# now we can supply those values to boxplot
boxplot(
  test_scores$AGE[test_scores$GENDER == "F"],
  test_scores$AGE[test_scores$GENDER == "M"]
)
# an easier way is to use a formula
boxplot(AGE ~ GENDER, test_scores)
# and we can label things more neatly like this
boxplot(
  AGE ~ GENDER, test_scores,
  xlab = "GENDER", ylab = "AGE",
  main = "Boxplot of male and female ages"
)
# If we want, we can add a little '+' where the mean of each group is:
text(
  1:2,
  c(
    mean(test_scores$AGE[test_scores$GENDER == "F"]),
    mean(test_scores$AGE[test_scores$GENDER == "M"])
  ),
  c("+", "+")
)
# To make it easier to tell whether there is a significant difference between
# males and females, we can use notches: if the notches overlap, a
# significant difference is unlikely.
boxplot(
  AGE ~ GENDER, test_scores,
  xlab = "GENDER", ylab = "AGE",
  main = "Boxplot of male and female ages",
  notch = TRUE
)
# There are more complicated things we can do with box plots. We could look
# at how AGE is different by GENDER and L1:
boxplot(AGE ~ GENDER + L1, test_scores, notch = TRUE)
# F.E there means females with a European language as their L1, M.N is males
# with a non-European language as their L1.
# We could also overlap these: first we plot the scores of females:
boxplot(
  VOCAB ~ L1, test_scores,
  col = "gray", subset = GENDER == "F", boxwex = 0.9
)
# then we add the plots for males and make them a bit narrower and white
boxplot(
  VOCAB ~ L1, test_scores,
  col = "white", subset = GENDER == "M", boxwex = 0.6, add = TRUE
)

## Interaction plot
# This type of plot takes two categorical variables and one interval/ratio
# variable and shows how the three variables interact. For example:
interaction.plot(
  test_scores$L1, test_scores$GENDER, test_scores$OVERALL,
  xlab = "L1", ylab = "Scores",
  trace.label = "Gender", ylim = c(0, 10)
)
# So we can see that scores differ by gender, but interestingly, among
# L1-speakers of E (= English) male people have a slightly higher score on
# average than female people. This is reversed for L1 W (= Welsh) speakers,
# and there the difference between genders is more pronounced. All of this is
# shown in a single graph. Quite nice.
# Of course I have no idea if this is true outside of these concocted sample
# data.
# NOTE(review): the boxplot section glosses the L1 codes as E = European and
# N = non-European -- confirm which coding the data set actually uses.

## There is lots more to plots in R, look at the references to further reading or the web.