# Text preprocessing script built on the tm package.
#
# Reads every file in the directory `cname` into a corpus, strips punctuation,
# blanks out a fixed list of unwanted tokens, removes numbers, lower-cases the
# text, and writes the corpus to disk twice: once before stemming
# ("allMyCorpus_unstem.csv") and once after ("allMyCorpus_stem.csv").

# One-time setup ----
# Uncomment to install the packages this workflow relies on.
# needed <- c(
#   "tm", "SnowballC", "RColorBrewer", "ggplot2", "wordcloud",
#   "biclust", "cluster", "igraph", "fpc"
# )
# install.packages(needed, dependencies = TRUE)
# install.packages("qdap")
# library(SnowballC)

library(tm)

# Load the corpus ----
cname <- file.path("C:", "texts")
cname
dir(cname)

docs <- Corpus(DirSource(cname))
inspect(docs)

# Clean the text ----
docs <- tm_map(docs, removePunctuation)
inspect(docs)

# Substrings that are each replaced by a single space, applied in this exact
# order. Order matters: the short prefix "0x2" must come after the longer
# "0x2?" codes, otherwise it would consume their first three characters.
strip_patterns <- c(
  "0x22", "0x20", "0x27", "0x26", "0x2F", "0x2E", "0x2D", "0x2", "0x31",
  "x ",
  "sounds", "sound", "Sounds", "Sound",
  "dog", "bark"
)

# Wrap the replacements in content_transformer() so that only the text content
# of each document is rewritten. Assigning the result of gsub() straight into
# docs[[j]] replaces the document object with a plain character vector, which
# breaks later tm_map() calls.
blank_out_patterns <- content_transformer(function(x) {
  for (pattern in strip_patterns) {
    x <- gsub(pattern, " ", x)
  }
  x
})
docs <- tm_map(docs, blank_out_patterns)

docs <- tm_map(docs, removeNumbers)
# tolower() is a base function that returns plain character, so it needs the
# same content_transformer() wrapping to keep the corpus structure intact.
docs <- tm_map(docs, content_transformer(tolower))
# docs <- tm_map(docs, stripWhitespace)
inspect(docs)

# Save the unstemmed corpus ----
writeLines(as.character(docs), con = "allMyCorpus_unstem.csv")

# Stem and save ----
docs <- tm_map(docs, stemDocument)
# docs <- tm_map(docs, stripWhitespace)
inspect(docs[1])

# If the list of words looks OK, use the line below to save it as a new text
# file, then remove the `"` characters and the `list()` code from that file.
# CHOOSE THE CORRECT FILE NAME.
writeLines(as.character(docs), con = "allMyCorpus_stem.csv")