Verified Commit f62e4aa4 authored by Cody L Marquart's avatar Cody L Marquart
Browse files

Unseen word fixes

parent d5e4dfb9
Pipeline #55345 passed with stages
in 5 minutes and 4 seconds
......@@ -72,7 +72,7 @@ getHandSetIndices2 = function(
if(sum(is.na(indices)) == length(codeToUse$holdoutExcerpts) ){ # not enough positives to fill baserate
warning("Not enough positives in first rater to inflate to this level")
}
# browser()
excerpt_id <- sample(indices[!is.na(indices)],1) # get a random indice
indices[indices == excerpt_id] = NA # set this indice to be NA, won't be sampled again
......@@ -101,6 +101,7 @@ getHandSetIndices2 = function(
for(i in 1:maxNonPositives) {
excerpt_id <- sample(indices[!is.na(indices)],1) # get a random indice
autocoded = codeToUse$process(codeToUse$excerpts[excerpt_id])
codeToUse$touchedIndices = c(codeToUse$touchedIndices, excerpt_id);
indices[indices == excerpt_id] = NA # set this indice to be NA, won't be sampled again
this.set = c(this.set, excerpt_id)
codeToUse$computerSet = rbind(codeToUse$computerSet, c(excerpt_id, autocoded))
......@@ -142,12 +143,23 @@ getHandSetIndices2 = function(
}
if(unseen == TRUE) {
message("Unseen excerpts is in beta and still being tested")
codeToUse$additionalExcerpts <- unseen_words(codeToUse);
newRows <- data.frame(cbind(ID = codeToUse$additionalExcerpts, codeToUse$process(excerpts = codeToUse$excerpts[codeToUse$additionalExcerpts])))
colnames(newRows) <- colnames(codeToUse$computerSet)
codeToUse$computerSet = rbind(codeToUse$computerSet, newRows);
# this.set = c(this.set, codeToUse$additionalExcerpts);
message("Unseen excerpts is in beta and still being tested");
codeToUse$additionalExcerpts <- tryCatch(
unseen_words(codeToUse),
error = function(x) {
warning("Unable to add unseen words")
c()
}
)
codeToUse$touchedIndices = c(codeToUse$touchedIndices, codeToUse$additionalExcerpts);
if( !is.null(codeToUse$additionalExcerpts) && length(codeToUse$additionalExcerpts) > 0 ) {
newRows <- data.frame(cbind(
ID = codeToUse$additionalExcerpts,
codeToUse$process(excerpts = codeToUse$excerpts[codeToUse$additionalExcerpts])
));
colnames(newRows) <- colnames(codeToUse$computerSet);
codeToUse$computerSet = rbind(codeToUse$computerSet, newRows);
}
}
return(codeToUse);
......
......@@ -143,7 +143,7 @@ handcode = function(
), ncol = 2, dimnames = list(NULL, c("ID", code.to.use$name)))
}
}
# browser()
selfCoded_in_test <- which(!selfCodes[, 1] %in% code.to.use$additionalExcerpts);
selfCoded_in_test_IDs <- selfCodes[selfCoded_in_test, 1];
selfCoded_in_addl <- which(selfCodes[, 1] %in% code.to.use$additionalExcerpts);
......@@ -154,7 +154,9 @@ handcode = function(
})
if(length(selfCoded_in_addl) > 0) {
code.to.use$trainingSet = rbind(code.to.use$trainingSet, selfCodes[selfCoded_in_addl,])
new_training_set_rows <- selfCodes[selfCoded_in_addl,]
dimnames(new_training_set_rows) <- dimnames(code.to.use$trainingSet) #c("ID", "X1")
code.to.use$trainingSet = rbind(code.to.use$trainingSet, new_training_set_rows)
}
code.to.use
......
......@@ -14,35 +14,67 @@ create_tm = function(text){
#' Unseen Words
#'
#' @param code Code object
#' @param unweighted boolean TRUE (default), binarized the document matrix, so multiple occurrences of single
#' word in one line doesn't count multiple times
#'
#' @return Excerpts that contain unseen words
#'
#' @export
unseen_words <- function(code) {
text <- code$excerpts
words <- code$expressions
coded <- code$testSet[, 1];
hcy.index <- code$testSet[code$testSet[, 2] == 1, 1]
hcn.index <- code$testSet[code$testSet[, 2] == 0, 1]
unseen_words <- function(code, unweighted = TRUE) {
corpus <- code$excerpts;
expressions <- tolower(code$expressions);
# Set of all coded excerpts. Test Set will likely (should?) be
# empty. Including it in case additional excerpts are being added
# while in the test loop
coded_set <- rbind(code$testSet, code$trainingSet);
hcn.excerpts <- text[hcn.index];
# Indices for excerpts coded by the user
hcy_indices <- coded_set[which(coded_set[, 2] == 1), 1];
hcn_indices <- coded_set[which(coded_set[, 2] == 0), 1];
touchedIndices.c <- code$touchedIndices;
touchableIndices.c <- code$touchableExcerpts;
# Indices for other excerpts that can be included
seen_indices <- code$touchedIndices;
explorable_indices <- code$touchableExcerpts;
prioritized.indices <- c(hcy.index, touchedIndices.c, touchableIndices.c);
prioritized.excerpts <- text[prioritized.indices];
#create a prioritized vocab matrix
prioritized_tm <- create_tm(prioritized.excerpts);
prioritized.df <- as.data.frame(as.matrix(prioritized_tm));
# All indices to include in the intial Word By Document (WDM) matrix
word_by_document_indices <- c(hcy_indices, seen_indices, explorable_indices);
# Excerpts to build the WDM with
word_by_document_excerpts <- corpus[word_by_document_indices];
#create a word document matrix for handcode no exerpts
hcn.terms <- create_tm(hcn.excerpts);
filter_words <- c(hcn.terms$dimnames$Terms, words);
unseen_set <- prioritized.df[!row.names(prioritized.df) %in% filter_words,];
# Generate the WDM
word_by_document <- create_tm(word_by_document_excerpts);
# Convert WDM to data.frame
word_by_document_df <- as.data.frame(matrix(
data = word_by_document,
ncol = word_by_document$ncol,
dimnames = list(word_by_document$dimnames$Terms, word_by_document_indices)
))
if (unweighted == TRUE) {
word_by_document_df[word_by_document_df > 0 ] = 1
}
ordered_summed <- order(colSums(unseen_set), decreasing = TRUE);
head(ordered_summed[!ordered_summed %in% coded], 2);
# Find the unique words in the HCN for removal from WDM
hcn_excerpts <- corpus[hcn_indices];
# browser()
hcn_terms <- unique(gsub('[[:punct:] ]+','', tolower(tm::Boost_tokenizer(hcn_excerpts))))
filter_words <- c(hcn_terms, expressions);
filter_word_matches <- matrix(
sapply(filter_words, function(w) grepl(x = row.names(word_by_document_df), pattern = w)),
ncol = length(filter_words)
)
unseen_set <- word_by_document_df[!rowSums(filter_word_matches) > 0,]
unseen_set_sums <- colSums(unseen_set)
# browser()
if(any(unseen_set_sums > 0)) {
unseen_set_found <- unseen_set_sums[unseen_set_sums > 0]
as.integer(names(tail(sort(unseen_set_found), 2)))
}
else {
NULL
}
}
\ No newline at end of file
......@@ -121,22 +121,124 @@ testthat::test_that("Calculate precision and recall", {
testthat::expect_equal(newcode4$statistics[[1]]$one_v_classifier$test_set$precision, 1)
})
testthat::test_that("Verify unseen words used", {
name <- "Data"
set <- 10:15
exprs <- c("number","priority")
coded <- sapply(rs_text[set], grepl, pattern = paste0(exprs, collapse = "|")) * 1
newcode <- create.code(name, expressions = exprs, excerpts = rs_text)
newcode <- handcode(code = newcode, this.set = set, results = c(rep(0, 5), 1, 0, 0), unseen = TRUE)
newcode <- test(code = newcode)
testthat::expect_equal(length(newcode$statistics), 1)
first_stats <- newcode$statistics[[1]]
testthat::expect_equal(first_stats$one_v_classifier$test_set$N, length(set))
testthat::expect_equal(
as.numeric(first_stats$one_v_classifier$test_set$set[,2]),
coded
)
name <- "letters"
# set <- 10:15
exprs <- c("(A|L)")
text <- matrix(c(
## Holdout Set
"A B C",
"B D", # HCN
"A A A A B B A D",
"E E E E E E E E F",
"C C C C B",
"A A D D", # HCY
"A B C",
"B D",
"A A A A B B A D",
"E E E E E E E E F",
"C C C C B",
"A A D D",
"A B C",
"B D",
"A A A A B B A D",
"E E E E E E E E F",
"A B C",
"B D",
"A A A A B B A D",
"E E E E E E E E F",
"C C C C B",
"A A D D",
"A B C",
"B D",
"A A A A B B A D",
"E E E E E E E E F",
"C C C C B",
"A A D D",
"A B C",
"B D",
"A A A A B B A D",
"E E E E E E E E F",
## Explorable Set
"B B B C C C",
"D D D D E E B B",
"X X Y Y Z",
"W X Y Z",
"B B B C C C",
"D D D D E E B B",
"M X Y Z",
"A X Y Z"
))
newcode <- create.code(name, expressions = exprs, excerpts = text);
newcode$holdoutExcerpts <- 1:32;
newcode$touchableExcerpts <- 33:40;
newcode$trainingSet <- data.frame(ID = c(6,2), X1 = c(1, 0))
unseen_indices <-unseen_words(newcode, unweighted = TRUE)
testthat::expect_true(all(unseen_indices %in% c(36, 39)))
unseen_indices <- unseen_words(newcode, unweighted = FALSE)
testthat::expect_true(all(unseen_indices %in% c(35, 39)))
})
testthat::test_that("Verify unseen words with bad word as regex", {
name <- "letters"
# set <- 10:15
exprs <- c("(A|L)")
text <- matrix(c(
## Holdout Set
"A B C",
"B D", # HCN
"A A A A B B A D",
"E E E E E E E E F",
"C C C C B",
"A A D D", # HCY
"A B C",
"B D",
"A A A A B B A D",
"E E E E E E E E F",
"C C C C B",
"A A D D",
"A B C",
"B D",
"A A A A B B A D",
"E E E E E E E E F",
"A B C",
"B D",
"A A A A B B A D",
"E E E E E E E E F",
"C C C C B",
"A A D D",
"A B C",
"B D",
"A A A A B B A D",
"E E E E E E E E F",
"C C C C B",
"A A D D",
"A B C",
"B D",
"A A A A B B A D",
"E E E E E E E E F",
## Explorable Set
"A (A A A A",
"A A A A A",
"A A A A A",
"A A A A A",
"A A A A A",
"A A A A A",
"A A A A A",
"A A A A A"
))
newcode <- create.code(name, expressions = exprs, excerpts = text);
newcode$holdoutExcerpts <- 1:32;
newcode$touchableExcerpts <- 33:40;
newcode$touchedIndices <- NULL;
newcode$trainingSet <- data.frame(ID = c(6,2), X1 = c(0, 0))
unseen_indices <-unseen_words(newcode, unweighted = TRUE)
testthat::expect_null(unseen_indices)
newcode$excerpts[40] <- "A (A A K"
unseen_indices <-unseen_words(newcode, unweighted = TRUE)
testthat::expect_equal(unseen_indices, c(40))
})
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment