# Corpus Linguistics with R, Day 1

July 28, 2009
By

(This article was first published on Cornelius Puschmann's Blog » R, and kindly contributed to R-bloggers)

(This post documents the first day of a class on R that I took at ESU C&T. It is posted here purely for my own use.)

 R Lesson 1

```
> 2+3; 2/3; 2^3
[1] 5
[1] 0.6666667
[1] 8
---
Fundamentals - Functions
> log(x=1000, base=10)
[1] 3
---
(formals() describes the syntax of other functions, i.e. their arguments and defaults)
formals(sample)
---
Variables ( <- allows you to save something in a data structure (variable) )
> a<-2+3
> a
[1] 5
# "#" is for comments; whitespace doesn't matter
---
# Pick files
file.choose()
# Get working dir
getwd()
# Set working dir
setwd("..")
# Save
> save(VARIABLE_NAME, file=file.choose())
Fehler in save(test, file = file.choose()) : Objekt ‘test’ nicht gefunden
# (German locale; in English: Error in save(test, file = file.choose()) : object 'test' not found)
> save.image("FILE_NAME")
---
> setwd("/home/cornelius/Code/samples/Brown_95perc")
> getwd()
[1] "/home/cornelius/Code/samples/Brown_95perc"
> dir()
> my_array <- c(1,2,3,4)
> my_array
[1] 1 2 3 4
> my_array <- c("lalala", "lululu", "bla")
> my_array2 <- c(1,2,3,4)
> c(my_array, my_array2)
[1] "lalala" "lululu" "bla" "1" "2" "3" "4"
>
# it is possible to add something to ALL values in a vector, e.g.
my_array2 + 10
# c() (combine/concatenate) makes a vector
stuff1<-c(1,2,3,4,5)
---
# sequence starts at 1 (first arg), ends at 5 (second arg), increments by 1 (third arg)
seq(1, 5, 1)
---
# put a file into a corpus vector
# what=real|char sep=separator
> my_corpus<-scan(file=file.choose(), what="char", sep="\n")
# unique elements in my array
unique(array)
# count elements in an array
table(array)
# sort elements in an array
sort(table(array))
---
# this tells me the position of the elements in my text that aren't "this"
> values<-which(my_little_corpus!="this")
> values
[1] 2 3 4 5 6 7 8 9 11 12 13 14
# this will produce TRUE|FALSE for my condition (is this element NOT "this")
> values<-my_little_corpus!="this"
> values
[1] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE
[13] TRUE TRUE
# this will return the array without "this"
> values<-my_little_corpus[my_little_corpus!="this"]
> values
[1] "is" "just" "a" "little" "example" "bla" "bla"
[8] "bla" "is" "the" "third" "line"
...
```
```
> cc<-c("banana", "bagel")
> cc == "banana"; cc!="banana" #
[1] TRUE FALSE
[1] FALSE TRUE
> "banana" %in% cc
[1] TRUE
> c("bagel", "banana") %in% cc
[1] TRUE TRUE
> match ("banana", cc)
[1] 1
> match (c("bagel","banana"), cc)
[1] 2 1
# match looks for a list of tokens and returns their position in the data structure
---
> cat(bb, sep="\n", file=scan(what="char"), append=F) # write the contents of bb to a file, ask the user for file
moo<-scan(what="char") # read something the user types into a var
# Clear Mem
> rm(list=ls(all=T))
>
---
# create vector1 (ordered)
vector1<-c("a","b","c","d","e","f","g","h","i","j")
# or
# > letters[1:10]
# [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j"
# create vector2 (random)
# > vector2<-sample(vector1)
---
length() # number of elements
nchar() # number of characters
> aa<-"know"
> nchar(aa)
[1] 4
> aa<-c("I","do","not","know")
> nchar(aa)
[1] 1 2 3 4
> lala<-c("cat","gnu","hippopotamus")
> lala
[1] "cat" "gnu" "hippopotamus"
> nchar(lala)
[1] 3 3 12
> substr("hippopotamus", 0, 5)
[1] "hippo"
>
# like explode() / implode()
paste (string, sep="my_separator", collapse="stuff to put in")
---
# percentages
x/sum(x)
barplot(c(1,2,3))

Read in corpus data and build a list of word frequencies
1) scan file
2) strsplit by " "
3) unlist to make vector
4) make a table with freqs
5) sort
6) output

# search for strings
grep("needle", haystack)
> grep("is", text, value=T)
[1] "This is a first example sentence."
[2] "And this is a second example sentence."
> grep("And", text, value=T)
[1] "And this is a second example sentence."
> grep("sentence", text, value=T)
[1] "This is a first example sentence."
[2] "And this is a second example sentence."
> gregexpr # alternative to grep, returns a list of vectors
> mat<-gregexpr("e", text)
> mat
[[1]]
[1] 17 23 26 29 32
attr(,"match.length")
[1] 1 1 1 1 1

[[2]]
[1] 16 22 28 31 34 37
attr(,"match.length")
[1] 1 1 1 1 1 1
```

```
> unlist(mat)
[1] 17 23 26 29 32 16 22 28 31 34 37
> mat<-gregexpr("sentence", text)
> sapply (mat, c)
[1] 25 30
```