Verified Commit d5e4dfb9 authored by Cody L Marquart's avatar Cody L Marquart
Browse files

Fix for getHandSet

parent b6afe0a2
Pipeline #54946 passed with stages
in 7 minutes and 24 seconds
......@@ -55,37 +55,40 @@ getHandSetIndices2 = function(
# create data frame same length as excerpts -- used for keeping track of indices sampled
if(is.null(this.set)) {
indices = c(1:length(codeToUse$holdoutExcerpts))
# indices = c(1:length(codeToUse$holdoutExcerpts))
indices = codeToUse$holdoutExcerpts;
# set touchedIndices, testSet, and trainingSet to NA - ensure they won't be sampled again
indices[codeToUse$touchedIndices] = NA
indices[codeToUse$testSet[,1]] = NA
indices[codeToUse$trainingSet[,1]] = NA
indices[
indices %in% c(codeToUse$touchedIndices, codeToUse$testSet[,1], codeToUse$trainingSet[, 1])
] <- NA
# browser()
# get miniumum number of positive indices necessary
while(positives > 0){
if(length(codeToUse$holdoutExcerpts) - sum(is.na(indices)) < handSetLength) {
warning("Not enough excerpts left to generate set")
}
# browser()
while(positives > 0){
if(sum(is.na(indices)) == length(codeToUse$holdoutExcerpts) ){ # not enough positives to fill baserate
stop("Not enough positives in first rater to inflate to this level")
warning("Not enough positives in first rater to inflate to this level")
}
randIndice = sample(indices[!is.na(indices)],1) # get a random indice
indices[randIndice] = NA # set this indice to be NA, won't be sampled again
excerpt_id <- sample(indices[!is.na(indices)],1) # get a random indice
indices[indices == excerpt_id] = NA # set this indice to be NA, won't be sampled again
# autocode excerpt at this single indice
autocoded = codeToUse$process(codeToUse$excerpts[randIndice])
codeToUse$touchedIndices = c(codeToUse$touchedIndices, randIndice)
autocoded = codeToUse$process(codeToUse$excerpts[excerpt_id]);
codeToUse$touchedIndices = c(codeToUse$touchedIndices, excerpt_id);
if(autocoded == 1){
positives = positives - 1;
this.set = c(this.set, randIndice) # only adding positive indices to handset
# codeToUse$computerSet = rbind(codeToUse$computerSet, data.frame("ID" = randIndice, "X1" = autocoded))
codeToUse$computerSet = rbind(codeToUse$computerSet, c(randIndice, autocoded))
this.set = c(this.set, excerpt_id) # only adding positive indices to handset
codeToUse$computerSet = rbind(codeToUse$computerSet, c(excerpt_id, autocoded))
}
else{ # deal with nonPositive excerpt case
if(maxNonPositives != 0){ # room in handSet to add nonPositive -> add it
this.set = c(this.set, randIndice)
this.set = c(this.set, excerpt_id)
maxNonPositives = maxNonPositives - 1;
# codeToUse$computerSet = rbind(codeToUse$computerSet, data.frame("ID" = randIndice, "X1" = autocoded))
codeToUse$computerSet = rbind(codeToUse$computerSet, c(randIndice, autocoded))
codeToUse$computerSet = rbind(codeToUse$computerSet, c(excerpt_id, autocoded))
}
else{ # if max number of nonPositve indices is reached, save indice, but don't add to set
......@@ -94,15 +97,16 @@ getHandSetIndices2 = function(
}
# fill up the rest of handset with random indices, positive or not
if(maxNonPositives != 0){
for(i in 1:maxNonPositives){
randIndice = sample(indices[!is.na(indices)],1)
autocoded = codeToUse$process(codeToUse$excerpts[randIndice])
indices[randIndice] = NA
this.set = c(this.set, randIndice)
codeToUse$computerSet = rbind(codeToUse$computerSet, c(randIndice, autocoded))
if (maxNonPositives != 0) {
for(i in 1:maxNonPositives) {
excerpt_id <- sample(indices[!is.na(indices)],1) # get a random indice
autocoded = codeToUse$process(codeToUse$excerpts[excerpt_id])
indices[indices == excerpt_id] = NA # set this indice to be NA, won't be sampled again
this.set = c(this.set, excerpt_id)
codeToUse$computerSet = rbind(codeToUse$computerSet, c(excerpt_id, autocoded))
}
}
# browser()
additional_excerpts = NULL;
# if(unseen == TRUE) {
......
......@@ -36,3 +36,12 @@ test_that("Check the touched sets", {
testthat::expect_equal(0.1, touchable_perc, tolerance = 0.01)
testthat::expect_equal(0.9, holdout_perc, tolerance = 0.01)
})
test_that("Avoiding duplicates in test set", {
d <- data.frame(id = rev(seq(LETTERS)), letter = LETTERS)
d <- rbind(d, d, d)
d <- rbind(d, d, d)
exprs <- c("M", "N", "O", "P")
newcode <- create.code("letters", expressions = exprs, excerpts = d$letter);
newcode <- getHandSetIndices2(code = newcode, handSetLength = 10, handSetBaserate = 0.1);
nrow(newcode$testSet);
})
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment