Verified Commit 23546d5a authored by Cody L Marquart's avatar Cody L Marquart
Browse files

Updated unseen word flow

parent 7d893281
Pipeline #55962 failed with stages
in 2 minutes and 59 seconds
......@@ -14,31 +14,36 @@ create_tm = function(text){
#' Unseen Words
#'
#' @param code Code object
#' @param unweighted boolean TRUE (default), binarized the document matrix, so multiple occurrences of single
#' @param unweighted logical TRUE (default), binarized the document matrix, so multiple occurrences of single
#' word in one line doesn't count multiple times
#' @param include_handcoded logical FALSE (default), will not use words from the handcoded set. If TRUE, the handcoded set will be used
#' @param exclude_matched_by character, either "excerpts" (default) or "word". "excerpts" will remove full excerpts that match an expression, "word" will remove words from WDM to not include in sum of excerpts
#'
#' @return Excerpts that contain unseen words
#'
#' @export
unseen_words <- function(code, unweighted = TRUE) {
unseen_words <- function(code, unweighted = TRUE, include_handcoded = FALSE, exclude_matched_by = c("excerpts", "word")) {
corpus <- code$excerpts;
expressions <- tolower(code$expressions);
exclude_matched_by <- match.arg(exclude_matched_by, c("excerpts", "word"))
# Set of all coded excerpts. Test Set will likely (should?) be
# empty. Including it in case additional excerpts are being added
# while in the test loop
coded_set <- rbind(code$testSet, code$trainingSet);
# Indices for excerpts coded by the user
hcy_indices <- coded_set[which(coded_set[, 2] == 1), 1];
hcn_indices <- coded_set[which(coded_set[, 2] == 0), 1];
# Indices for other excerpts that can be included
seen_indices <- code$touchedIndices;
explorable_indices <- code$touchableExcerpts;
# All indices to include in the intial Word By Document (WDM) matrix
word_by_document_indices <- c(hcy_indices, seen_indices, explorable_indices);
word_by_document_indices <- c(seen_indices, explorable_indices);
if(include_handcoded == TRUE) {
# Indices for excerpts coded by the user
hcy_indices <- coded_set[which(coded_set[, 2] == 1), 1];
word_by_document_indices <- c(hcy_indices, word_by_document_indices);
}
# Excerpts to build the WDM with
word_by_document_excerpts <- corpus[word_by_document_indices];
......@@ -56,20 +61,29 @@ unseen_words <- function(code, unweighted = TRUE) {
if (unweighted == TRUE) {
word_by_document_df[word_by_document_df > 0 ] = 1
}
# Find the unique words in the HCN for removal from WDM
hcn_excerpts <- corpus[hcn_indices];
# browser()
hcn_terms <- unique(gsub('[[:punct:] ]+','', tolower(tm::Boost_tokenizer(hcn_excerpts))))
filter_words <- c(hcn_terms, expressions);
filter_words <- c(expressions);
if(include_handcoded == TRUE) {
# Find the unique words in the HCN for removal from WDM
hcn_indices <- coded_set[which(coded_set[, 2] == 0), 1];
hcn_excerpts <- corpus[hcn_indices];
hcn_terms <- unique(gsub('[[:punct:] ]+','', tolower(tm::Boost_tokenizer(hcn_excerpts))))
filter_words <- c(hcn_terms, expressions);
}
filter_word_matches <- matrix(
sapply(filter_words, function(w) grepl(x = row.names(word_by_document_df), pattern = w)),
ncol = length(filter_words)
)
unseen_set <- word_by_document_df[!rowSums(filter_word_matches) > 0,]
if(exclude_matched_by == "word") {
unseen_set <- word_by_document_df[!rowSums(filter_word_matches) > 0,]
}
else if(exclude_matched_by == "excerpts") {
unseen_set <- word_by_document_df[, colSums(rowSums(filter_word_matches) * word_by_document_df) == 0]
}
unseen_set_sums <- colSums(unseen_set)
# browser()
if(any(unseen_set_sums > 0)) {
unseen_set_found <- unseen_set_sums[unseen_set_sums > 0]
as.integer(names(tail(sort(unseen_set_found), 2)))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment