# Documente: Academic
# Documente: Profesional
# Documente: Cultură
# 0. Preliminaries ----
# NOTE(review): rm(list = ls()) clears the workspace but does not detach
# packages or reset options(); it is generally discouraged in scripts —
# prefer starting a fresh R session instead.
rm(list = ls())
# NOTE(review): hard-coding the working directory makes the script
# non-portable; the file reads further down assume files on the Desktop.
setwd("~/Desktop")
# Indices of elements of my_string_vector containing a literal "?".
# The "?" is escaped ("\\?") because it is a regex metacharacter.
# NOTE(review): my_string_vector is assumed defined earlier in the
# tutorial — confirm against the preceding chunk.
grep("\\?",my_string_vector)
# Logical: does the FIRST element contain a literal "?"
grepl("\\?",my_string_vector[1])
# Replace every "e" in my_string with "___" (requires stringr loaded).
str_replace_all(my_string, "e","___")
# Extract all runs of consecutive digits from my_string, as a list.
str_extract_all(my_string,"[0-9]+")
# Example sentence used to demonstrate the string-cleaning function.
# The literal spans three source lines; the embedded newlines are part
# of the string.
sentence <- "The term 'data science' (originally used interchangeably with
'datalogy') has existed for over thirty years and was used initially as a
substitute for computer science by Peter Naur in 1960."
# Clean_String() is defined elsewhere in this tutorial — presumably it
# lowercases/strips punctuation and tokenizes; verify against its
# definition in the preceding chunk.
clean_sentence <- Clean_String(sentence)
print(clean_sentence)
return(to_return)
}
# Read in the example speech, one element of `text` per line.
# Make sure that you save this file in your current working directory (in my
# case the Desktop).
con <- file("Obama_Speech_2-24-09.txt", "r", blocking = FALSE)
# finally = close(con) guarantees the connection is closed even when
# readLines() errors, so an open file connection is never leaked.
text <- tryCatch(readLines(con), finally = close(con))
# Compile and load the C++ helper that builds the document-term matrix.
# Just make sure you saved the .cpp file to the current working directory.
Rcpp::sourceCpp('Generate_Document_Word_Matrix.cpp')
#' Create a list containing a vector of tokens in each document for each
#' document. These can be extracted from the cleaned text objects as follows.
# NOTE(review): clean_speech / clean_speech2 are assumed to be produced by
# an earlier cleaning step, each with a $text field of tokens — confirm.
doc_list <- list(clean_speech$text, clean_speech2$text)
#' Make sure to add column names to Doc_Term_Matrix, then take a look:
# Doc_Term_Matrix and unique_words are presumably returned by the C++
# routine sourced above; column i then corresponds to unique_words[i].
colnames(Doc_Term_Matrix) <- unique_words