Verified Commit a73abd11 authored by Cody L Marquart
Browse files

Fixes in unseen indices sampling

parent 23546d5a
Pipeline #63121 failed with stages
in 5 minutes and 2 seconds
......@@ -33,11 +33,11 @@ unseen_words <- function(code, unweighted = TRUE, include_handcoded = FALSE, exc
coded_set <- rbind(code$testSet, code$trainingSet);
# Indices for other excerpts that can be included
seen_indices <- code$touchedIndices;
explorable_indices <- code$touchableExcerpts;
seen_indices <- unique(code$touchedIndices);
explorable_indices <- unique(code$touchableExcerpts);
# All indices to include in the initial Word By Document (WDM) matrix
word_by_document_indices <- c(seen_indices, explorable_indices);
word_by_document_indices <- unique(c(seen_indices, explorable_indices));
if(include_handcoded == TRUE) {
# Indices for excerpts coded by the user
......@@ -76,6 +76,9 @@ unseen_words <- function(code, unweighted = TRUE, include_handcoded = FALSE, exc
ncol = length(filter_words)
)
word_by_document_df <- word_by_document_df[
,which(!colnames(word_by_document_df) %in% c(code$testSet[, 1], code$trainingSet[, 1]))
]
if(exclude_matched_by == "word") {
unseen_set <- word_by_document_df[!rowSums(filter_word_matches) > 0,]
}
......@@ -84,11 +87,11 @@ unseen_words <- function(code, unweighted = TRUE, include_handcoded = FALSE, exc
}
unseen_set_sums <- colSums(unseen_set)
ret <- NULL;
if(any(unseen_set_sums > 0)) {
unseen_set_found <- unseen_set_sums[unseen_set_sums > 0]
as.integer(names(tail(sort(unseen_set_found), 2)))
}
else {
NULL
ret <- as.integer(names(tail(sort(unseen_set_found), 2)))
}
return(ret);
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment