Verified Commit 7d893281 authored by Cody L Marquart's avatar Cody L Marquart
Browse files

Use baserate of the explorable set

parent f62e4aa4
calc_statistics <- function(set, kappa_threshold = 0.9, baserate_inflation = 0.2) {
calc_statistics <- function(set, kappa_threshold = 0.9, baserate_inflation = 0.2, estimated_baserate = NULL) {
stats = list( rho = NA, kappa = NA )
if( nrow(set) > 0 ) {
# browser()
stats <- tryCatch(
rhoR::rhoSet(
set[, -1],
ScSKappaThreshold = kappa_threshold,
testSetBaserateInflation = baserate_inflation
testSetBaserateInflation = baserate_inflation,
OcSBaserate = estimated_baserate
),
error = function(x) {
warning("Unable to calculate valid rho, returning NA with kappa")
......@@ -39,8 +41,13 @@ test <- function(code, kappa_threshold = 0.65, baserate_inflation = 0.2, ...) {
code.to.use <- code$clone(deep = TRUE);
code.to.use$baserateInflation <- baserate_inflation;
code.to.use$computerSet[,2] = code.to.use$process(code.to.use$excerpts[code.to.use$computerSet[,1]])
code.to.use$baserate <- mean(code.to.use$process(code.to.use$excerpts[code.to.use$touchedIndices]))
touchable_coded <- code.to.use$process(excerpts = code.to.use$excerpts[code.to.use$touchableExcerpts])
code.to.use$baserate <- mean(code.to.use$process(
code.to.use$excerpts[c(code.to.use$touchedIndices, code.to.use$touchableExcerpts)]
))
# code.to.use$baserate <- mean(c(code.to.use$computerSet[,2], touchable_coded))
first_v_classifier_test <- NULL
first_v_classifier_train <- NULL
first_v_second <- NULL
......@@ -53,7 +60,7 @@ test <- function(code, kappa_threshold = 0.65, baserate_inflation = 0.2, ...) {
to.test <- to.test[!to.test$ID %in% code.to.use$additionalExcerpts,]
if(!any(is.na(to.test[, 2:3])))
first_v_classifier_test = calc_statistics(to.test)
first_v_classifier_test = calc_statistics(to.test, estimated_baserate = code.to.use$baserate)
}
if(nrow(code.to.use$trainingSet) > 0 || length(code.to.use$additionalExcerpts)) {
# browser()
......@@ -63,7 +70,7 @@ test <- function(code, kappa_threshold = 0.65, baserate_inflation = 0.2, ...) {
to.test[is.na(to.test[,3]), 3] = abs(to.test[is.na(to.test[,3]), 2] - 1)
to.test[!to.test[,3] %in% 0:1, 3] <- abs(to.test[!to.test[,3] %in% 0:1, 2] - 1)
first_v_classifier_train = calc_statistics(to.test)
first_v_classifier_train = calc_statistics(to.test, estimated_baserate = code.to.use$baserate)
}
first_v_classifier <- test_result(
......@@ -80,21 +87,21 @@ test <- function(code, kappa_threshold = 0.65, baserate_inflation = 0.2, ...) {
first_set <- rbind(code.to.use$testSet, code.to.use$trainingSet)
to.test <- merge(first_set, second_set_testing, by = 1, all = TRUE)
to.test <- to.test[rowSums(!is.na(to.test[,2:3]) * 1) > 1, ]
first_v_second <- calc_statistics(to.test)
first_v_second <- calc_statistics(to.test, estimated_baserate = code.to.use$baserate)
to.test <- merge(code.to.use$computerSet, second_set_testing, by = 1, all = TRUE)
to.test <- to.test[rowSums(!is.na(to.test[,2:3]) * 1) > 1, ]
second_v_classifier <- calc_statistics(to.test)
second_v_classifier <- calc_statistics(to.test, estimated_baserate = code.to.use$baserate)
}
if(nrow(second_set_training) > 0) {
first_set <- rbind(code.to.use$testSet, code.to.use$trainingSet)
to.test <- merge(first_set, second_set_training, by = 1, all = TRUE)
to.test <- to.test[rowSums(!is.na(to.test[,2:3]) * 1) > 1, ]
first_v_second_train <- calc_statistics(to.test)
first_v_second_train <- calc_statistics(to.test, estimated_baserate = code.to.use$baserate)
to.test <- merge(code.to.use$computerSet, second_set_training, by = 1, all = TRUE)
to.test <- to.test[rowSums(!is.na(to.test[,2:3]) * 1) > 1, ]
second_v_classifier_train <- calc_statistics(to.test)
second_v_classifier_train <- calc_statistics(to.test, estimated_baserate = code.to.use$baserate)
}
cloned_code <- code.to.use$clone(deep = TRUE)
......
......@@ -124,6 +124,8 @@ testthat::test_that("Verify unseen words used", {
name <- "letters"
# set <- 10:15
exprs <- c("(A|L)")
# Unseen text ----
text <- matrix(c(
## Holdout Set
"A B C",
......@@ -170,6 +172,7 @@ testthat::test_that("Verify unseen words used", {
"A X Y Z"
))
# Begin tests ----
newcode <- create.code(name, expressions = exprs, excerpts = text);
newcode$holdoutExcerpts <- 1:32;
newcode$touchableExcerpts <- 33:40;
......@@ -178,6 +181,9 @@ testthat::test_that("Verify unseen words used", {
testthat::expect_true(all(unseen_indices %in% c(36, 39)))
unseen_indices <- unseen_words(newcode, unweighted = FALSE)
testthat::expect_true(all(unseen_indices %in% c(34, 38)))
unseen_indices <- unseen_words(newcode, unweighted = FALSE, include_handcoded = TRUE)
testthat::expect_true(all(unseen_indices %in% c(35, 39)))
})
testthat::test_that("Verify unseen words with bad word as regex", {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment