Verified Commit 1c176f09 authored by Cody L Marquart's avatar Cody L Marquart
Browse files

Training set stats for second rater

parent b69a2878
Pipeline #46958 failed with stages
in 1 minute and 58 seconds
calc_statistics <- function(set, kappa_threshold = 0.65, baserate_inflation = 0.2) {
calc_statistics <- function(set, kappa_threshold = 0.9, baserate_inflation = 0.2) {
stats = list( rho = NA, kappa = NA )
if( nrow(set) > 0 ) {
......@@ -37,11 +37,15 @@ test <- function(code, kappa_threshold = 0.65, baserate_inflation = 0.2, ...) {
}
code.to.use <- code$clone(deep = TRUE);
code.to.use$baserateInflation <- baserate_inflation;
code.to.use$computerSet[,2] = code.to.use$process(code.to.use$excerpts[code.to.use$computerSet[,1]])
code.to.use$baserate <- mean(code.to.use$process(code.to.use$excerpts[code.to.use$touchedIndices]))
first_v_classifier_test <- NULL
first_v_classifier_train <- NULL
first_v_second <- NULL
second_v_classifier <- NULL
first_v_second_train <- NULL
second_v_classifier_train <- NULL
if(nrow(code$testSet) > 0) {
to.test <- merge(code.to.use$computerSet, code.to.use$testSet, by = 1)
......@@ -61,23 +65,42 @@ test <- function(code, kappa_threshold = 0.65, baserate_inflation = 0.2, ...) {
training_set = first_v_classifier_train
)
if(nrow(code.to.use$secondRaterSet) > 0) {
second_set_training_ids <- unique(as.numeric(unlist(sapply(code$statistics, function(s) {
s$one_v_two$test_set$set$ID
}))))
second_set_training <- code.to.use$secondRaterSet[code.to.use$secondRaterSet[,1] %in% second_set_training_ids,]
second_set_testing <- code.to.use$secondRaterSet[!code.to.use$secondRaterSet[,1] %in% second_set_training_ids,]
if(nrow(second_set_testing) > 0) {
first_set <- rbind(code.to.use$testSet, code.to.use$trainingSet)
to.test <- merge(first_set, code.to.use$secondRaterSet, by = 1, all = TRUE)
to.test <- merge(first_set, second_set_testing, by = 1, all = TRUE)
to.test <- to.test[rowSums(!is.na(to.test[,2:3]) * 1) > 1, ]
first_v_second <- calc_statistics(to.test)
to.test <- merge(code.to.use$computerSet, code.to.use$secondRaterSet, by = 1, all = TRUE)
to.test <- merge(code.to.use$computerSet, second_set_testing, by = 1, all = TRUE)
to.test <- to.test[rowSums(!is.na(to.test[,2:3]) * 1) > 1, ]
second_v_classifier <- calc_statistics(to.test)
}
if(nrow(second_set_training) > 0) {
first_set <- rbind(code.to.use$testSet, code.to.use$trainingSet)
to.test <- merge(first_set, second_set_training, by = 1, all = TRUE)
to.test <- to.test[rowSums(!is.na(to.test[,2:3]) * 1) > 1, ]
first_v_second_train <- calc_statistics(to.test)
to.test <- merge(code.to.use$computerSet, second_set_training, by = 1, all = TRUE)
to.test <- to.test[rowSums(!is.na(to.test[,2:3]) * 1) > 1, ]
second_v_classifier_train <- calc_statistics(to.test)
}
cloned_code <- code.to.use$clone(deep = TRUE)
cloned_code$statistics <- NULL
cloned_code$excerpts <- NULL
cloned_code$holdoutExcerpts <- NULL
cloned_code$touchableExcerpts <- NULL
new_tests <- list(
one_v_classifier = first_v_classifier,
one_v_two = test_result(test_set = first_v_second),
two_v_classifier = test_result(test_set = second_v_classifier),
one_v_two = test_result(test_set = first_v_second, training_set = first_v_second_train),
two_v_classifier = test_result(test_set = second_v_classifier, training_set = second_v_classifier_train),
classifier = cloned_code
)
......
......@@ -28,7 +28,7 @@ test_that("Verify statistics output", {
testthat::expect_equal(first_stats$one_v_classifier$test_set$N, length(set))
testthat::expect_equal(
first_stats$one_v_classifier$test_set$set[,2],
as.numeric(first_stats$one_v_classifier$test_set$set[,2]),
coded
)
})
......
data(RS.data)
rs_text = RS.data$text
test_that("Basic second rater", {
name <- "Data"
set <- 10:15
exprs <- c("number","priority")
coded <- sapply(rs_text[set], grepl, pattern = paste0(exprs, collapse = "|")) * 1
newcode <- create.code(name, expressions = exprs, excerpts = rs_text) %>%
handcode(this.set = set, results = 0)
newcode <- test(code = newcode, kappa_threshold = 0.9)
testthat::expect_equal(length(newcode$statistics), 1)
newcode$secondRaterSet <- data.frame(ID = newcode$testSet[,1], newcode$computerSet[,2])
newcode <- test(code = newcode, kappa_threshold = 0.9)
testthat::expect_equal(length(newcode$statistics), 2)
testthat::expect_equal(newcode$statistics[[2]]$two_v_classifier$test_set$kappa, 1)
testthat::expect_null(newcode$statistics[[2]]$one_v_two$training_set)
})
test_that("Second rater training set", {
name <- "Data"
set_one <- 10:15
set_two <- 16:25
exprs <- c("number","priority")
coded <- sapply(rs_text[set_one], grepl, pattern = paste0(exprs, collapse = "|")) * 1
newcode <- create.code(name, expressions = exprs, excerpts = rs_text) %>%
handcode(this.set = set_one, results = 0) %>%
test(kappa_threshold = 0.9)
newcode$secondRaterSet <- data.frame(ID = newcode$testSet[,1], newcode$computerSet[,2])
newcode <- test(code = newcode, kappa_threshold = 0.9)
newcode <- handcode(code = newcode, this.set = set_two, results = 0) %>%
test(kappa_threshold = 0.9)
testthat::expect_null(newcode$statistics[[3]]$one_v_two$test_set)
testthat::expect_equal(newcode$statistics[[3]]$one_v_two$training_set$set$ID, set_one)
newcode$secondRaterSet <- rbind(
newcode$secondRaterSet,
data.frame(ID = newcode$testSet[,1], "Data" = 1)
)
newcode <- test(code = newcode, kappa_threshold = 0.9)
newcode$statistics[[4]]$one_v_two$test_set
testthat::expect_false(is.null(newcode$statistics[[4]]$one_v_two$test_set))
testthat::expect_equal(newcode$statistics[[4]]$one_v_two$training_set$set$ID, set_one)
testthat::expect_equal(newcode$statistics[[4]]$one_v_two$test_set$set$ID, set_two)
})
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment