Unverified Commit b203ee0a authored by Cody L Marquart's avatar Cody L Marquart
Browse files

Merge branch 'develop'

parents aca087f7 ed30d4d2
Pipeline #47526 passed with stages
in 8 minutes and 32 seconds
......@@ -7,7 +7,7 @@ stages:
- release
variables:
VERSION: "0.1.3.1"
VERSION: "0.2.0.0"
PKG: "ncodeR"
PKGL: "ncoder"
REG_URL: "${CI_REGISTRY}/epistemic-analytics/qe-packages/ncoder"
......
......@@ -4,7 +4,7 @@ Type: Package
Author: Cody L Marquart [aut, cre], Zachari Swiecki [aut], Brendan Eagan [aut], David Williamson Shaffer [aut]
Authors@R: c(person("Cody L","Marquart", role = c("aut", "cre"), email="cody.marquart@wisc.edu", comment = c(ORCID = "0000-0002-3387-6792")),person("Zachari","Swiecki", role = c("aut"), email="swiecki@wisc.edu"),person("Brendan","Eagan", role = c("aut"), email="beagan@wisc.edu"),person("David", "Williamson Shaffer", role = c("aut"), email = "dws@education.wisc.edu"))
Maintainer: Cody L Marquart <cody.marquart@wisc.edu>
Version: 0.1.3.1
Version: 0.2.0.0
Description: A set of techniques that can be used to develop, validate, and implement automated classifiers. A powerful tool for transforming raw data into meaningful information, 'ncodeR' (Shaffer, D. W. (2017) Quantitative Ethnography. ISBN: 0578191687) is designed specifically for working with big data: large document collections, logfiles, and other text data.
LazyData: TRUE
BugReports: https://gitlab.com/epistemic-analytics/qe-packages/ncoder/issues
......@@ -17,6 +17,7 @@ Imports:
cli
Suggests:
testthat,
magrittr,
knitr,
rmarkdown
RoxygenNote: 6.1.1
......
......@@ -4,11 +4,9 @@ S3method(as.data.frame,Code)
S3method(as.data.frame,CodeSet)
S3method(print,summary.Code)
S3method(print,summary.CodeSet)
S3method(print,summary.Test)
S3method(print,summary.TestList)
S3method(summary,Code)
S3method(summary,CodeSet)
S3method(summary,Test)
S3method(summary,TestList)
export(CodeSet)
export(RegexCode)
......
calc_statistics <- function(set, kappa_threshold = 0.65, baserate_inflation = 0.2) {
calc_statistics <- function(set, kappa_threshold = 0.9, baserate_inflation = 0.2) {
stats = list( rho = NA, kappa = NA )
if( nrow(set) > 0 ) {
......@@ -25,17 +25,27 @@ calc_statistics <- function(set, kappa_threshold = 0.65, baserate_inflation = 0.
#' @param code [TBD]
#' @param kappa_threshold [TBD]
#' @param baserate_inflation [TBD]
#' @param ... [TBD]
#'
#' @return code object
#' @export
test <- function(code, kappa_threshold = 0.65, baserate_inflation = 0.2) {
code.to.use <- code$clone(deep=T);
test <- function(code, kappa_threshold = 0.65, baserate_inflation = 0.2, ...) {
args <- list(...)
if(!is.null(args$kappaThreshold)) {
warning("Use kappa_threshold instead of kappaThreshold. kappaThreshold will be deprecated in a future version.")
kappa_threshold <- args$kappaThreshold
}
code.to.use <- code$clone(deep = TRUE);
code.to.use$baserateInflation <- baserate_inflation;
code.to.use$computerSet[,2] = code.to.use$process(code.to.use$excerpts[code.to.use$computerSet[,1]])
code.to.use$baserate <- mean(code.to.use$process(code.to.use$excerpts[code.to.use$touchedIndices]))
first_v_classifier_test <- NULL
first_v_classifier_train <- NULL
first_v_second <- NULL
second_v_classifier <- NULL
first_v_second_train <- NULL
second_v_classifier_train <- NULL
if(nrow(code$testSet) > 0) {
to.test <- merge(code.to.use$computerSet, code.to.use$testSet, by = 1)
......@@ -55,24 +65,46 @@ test <- function(code, kappa_threshold = 0.65, baserate_inflation = 0.2) {
training_set = first_v_classifier_train
)
if(nrow(code.to.use$secondRaterSet) > 0) {
second_set_training_ids <- unique(as.numeric(unlist(sapply(code$statistics, function(s) {
s$one_v_two$test_set$set$ID
}))))
second_set_training <- code.to.use$secondRaterSet[code.to.use$secondRaterSet[,1] %in% second_set_training_ids,]
second_set_testing <- code.to.use$secondRaterSet[!code.to.use$secondRaterSet[,1] %in% second_set_training_ids,]
if(nrow(second_set_testing) > 0) {
first_set <- rbind(code.to.use$testSet, code.to.use$trainingSet)
to.test <- merge(first_set, code.to.use$secondRaterSet, by = 1, all = TRUE)
to.test <- merge(first_set, second_set_testing, by = 1, all = TRUE)
to.test <- to.test[rowSums(!is.na(to.test[,2:3]) * 1) > 1, ]
first_v_second <- calc_statistics(to.test)
to.test <- merge(code.to.use$computerSet, code.to.use$secondRaterSet, by = 1, all = TRUE)
to.test <- merge(code.to.use$computerSet, second_set_testing, by = 1, all = TRUE)
to.test <- to.test[rowSums(!is.na(to.test[,2:3]) * 1) > 1, ]
second_v_classifier <- calc_statistics(to.test)
}
if(nrow(second_set_training) > 0) {
first_set <- rbind(code.to.use$testSet, code.to.use$trainingSet)
to.test <- merge(first_set, second_set_training, by = 1, all = TRUE)
to.test <- to.test[rowSums(!is.na(to.test[,2:3]) * 1) > 1, ]
first_v_second_train <- calc_statistics(to.test)
to.test <- merge(code.to.use$computerSet, second_set_training, by = 1, all = TRUE)
to.test <- to.test[rowSums(!is.na(to.test[,2:3]) * 1) > 1, ]
second_v_classifier_train <- calc_statistics(to.test)
}
cloned_code <- code.to.use$clone(deep = TRUE)
cloned_code$statistics <- NULL
cloned_code$excerpts <- NULL
cloned_code$holdoutExcerpts <- NULL
cloned_code$touchableExcerpts <- NULL
new_tests <- list(
one_v_classifier = first_v_classifier,
one_v_two = test_result(test_set = first_v_second),
two_v_classifier = test_result(test_set = second_v_classifier)
one_v_two = test_result(test_set = first_v_second, training_set = first_v_second_train),
two_v_classifier = test_result(test_set = second_v_classifier, training_set = second_v_classifier_train),
classifier = cloned_code
)
code.to.use$setValue("testedTestSet", T)
code.to.use$setValue("testedTestSet", TRUE)
code.to.use$statistics[[length(code.to.use$statistics) + 1]] <- new_tests
code.to.use
}
......
......@@ -73,21 +73,22 @@ getHandSetIndices2 = function(
# autocode excerpt at this single indice
autocoded = codeToUse$process(codeToUse$excerpts[randIndice])
codeToUse$touchedIndices = c(codeToUse$touchedIndices, randIndice)
if(autocoded == 1){
positives = positives - 1;
# this.set = c(this.set, randIndice) # only adding positive indices to handset
this.set = c(this.set, randIndice) # only adding positive indices to handset
# codeToUse$computerSet = rbind(codeToUse$computerSet, data.frame("ID" = randIndice, "X1" = autocoded))
codeToUse$computerSet = rbind(codeToUse$computerSet, c(randIndice, autocoded))
}
else{ # deal with nonPositive excerpt case
if(maxNonPositives != 0){ # room in handSet to add nonPositive -> add it
# this.set = c(this.set, randIndice)
this.set = c(this.set, randIndice)
maxNonPositives = maxNonPositives - 1;
# codeToUse$computerSet = rbind(codeToUse$computerSet, data.frame("ID" = randIndice, "X1" = autocoded))
codeToUse$computerSet = rbind(codeToUse$computerSet, c(randIndice, autocoded))
}
else{ # if max number of nonPositve indices is reached, save indice, but don't add to set
codeToUse$touchedIndices = c(codeToUse$touchedIndices, randIndice)
}
}
}
......@@ -98,7 +99,7 @@ getHandSetIndices2 = function(
randIndice = sample(indices[!is.na(indices)],1)
autocoded = codeToUse$process(codeToUse$excerpts[randIndice])
indices[randIndice] = NA
# this.set = c(this.set, randIndice)
this.set = c(this.set, randIndice)
codeToUse$computerSet = rbind(codeToUse$computerSet, c(randIndice, autocoded))
}
}
......@@ -108,16 +109,25 @@ getHandSetIndices2 = function(
}
# randomize output order of indices
# codeToUse$testSet = rbind(
# codeToUse$testSet,
# #data.frame(ID=this.set[sample.int(n = handSetLength)], X1=NA)
# data.frame(ID=codeToUse$computerSet[,1][sample.int(n = handSetLength)], X1=NA)
# )
codeToUse$testSet = rbind(
codeToUse$testSet,
#data.frame(ID=this.set[sample.int(n = handSetLength)], X1=NA)
data.frame(ID=codeToUse$computerSet[,1][sample.int(n = handSetLength)], X1=NA)
matrix(
c(this.set[sample.int(n = handSetLength)], rep(NA, handSetLength)),
ncol = 2, nrow = handSetLength,
byrow = F, dimnames = list(NULL, colnames(codeToUse$testSet))
)
)
}
else {
autocoded = codeToUse$process(codeToUse$excerpts[this.set])
codeToUse$computerSet = rbind(codeToUse$computerSet, data.frame("ID" = this.set, "X1" = autocoded))
codeToUse$touchedIndices <- c(codeToUse$touchedIndices, this.set)
codeToUse$testSet = rbind(
codeToUse$testSet,
data.frame(ID=this.set, X1=NA)
......
......@@ -141,7 +141,8 @@ handcode = function(
coding = recoding = grepl(x=tolower(keepOn),pattern="^a",perl=T)
}
}
} else {
}
else {
selfCodes = matrix(c(
this.set,
rep(results, length(this.set) / length(results))
......
......@@ -13,12 +13,14 @@ resolve <- function(code = NULL, trainingSet = NULL, computerSet = NULL, express
code.to.use = NULL;
if(!is.null(code)) {
code.to.use = code$clone(deep = T);
} else {
}
else {
code.to.use = create.code();
}
if(!is.null(excerpts)) {
code.to.use$excerpts = excerpts;
} else {
}
else {
excerpts = code.to.use$excerpts;
}
if(!is.null(expressions)) {
......@@ -26,7 +28,8 @@ resolve <- function(code = NULL, trainingSet = NULL, computerSet = NULL, express
}
if(!is.null(ignored)) {
code.to.use$ignoredSet = ignored;
} else if(is.null(code.to.use$ignoredSet)) {
}
else if(is.null(code.to.use$ignoredSet)) {
ignored = c();
code.to.use$ignoredSet = ignored;
}
......@@ -156,7 +159,8 @@ resolve <- function(code = NULL, trainingSet = NULL, computerSet = NULL, express
if(!is.null(trainingSet)) {
code.to.use$trainingSet = trainingSet;
} else {
}
else {
if(nrow(code.to.use$testSet) && code.to.use$getValue("testedTestSet") == F) {
justResolve = resolveFirst()
if(justResolve != T) {
......
......@@ -76,7 +76,7 @@ print.summary.CodeSet = function(x, ...){
#' rs = RS.data
#' newcode = create.code(name = "Data",
#' expressions = c("number","data"), excerpts = rs$text)
#' #newcode = handcode(code = newcode, excerpts = rs$text, n = 4)
#' newcode <- handcode(newcode, this.set = 10:15, results = 0)
#' newcode = test(code = newcode, kappa_threshold = 0.65)
#' summary(newcode$statistics)
#' @return list of Test summary
......@@ -84,20 +84,17 @@ print.summary.CodeSet = function(x, ...){
summary.TestList = function(object, ...) {
args = list(...)
stats <- object[[length(object)]]$one_v_classifier
which.tests = c("test","training")
if(!is.null(args$which.tests)) {
which.tests = args$which.tests
}
summary = list(
stats$test_set$kappa, stats$test_set$N, stats$test_set$rho,
stats$training_set$kappa, stats$training_set$N
)
summaries = sapply(which.tests, function(t) {
res = object[[paste0(t,"Set")]]
if(!is.null(res) && length(res) > 0) {
summary(res[[length(res)]], which.test = t)
}
})
summaries
class(summary) = "summary.TestList"
summary
}
#' Print a TestList summary
#'
#' @param x list from summary()
......@@ -106,88 +103,35 @@ summary.TestList = function(object, ...) {
#' @examples
#' data(RS.data)
#' rs = RS.data
#' newcode = create.code( name = "Data",
#' expressions = c("number","data"), excerpts = rs$text)
#' #newcode = handcode(code = newcode, excerpts = rs$text, n = 4)
#' newcode <- create.code("Data", expressions = c("number","data"), excerpts = rs$text)
#' newcode <- handcode(newcode, this.set = 10:15, results = 0)
#' newcode = test(code = newcode, kappa_threshold = 0.65)
#' summary(newcode$statistics)
#'
#' @return prints summary
#' @export
print.summary.TestList = function(x, ...) {
browser()
}
#' Obtain a summary of a Code's test results
#'
#' @param object Test object of Code
#' @param ... Additional parameters
#'
#' @examples
#' data(RS.data)
#' rs = RS.data
#' newcode = create.code(name = "Data",
#' expressions = c("number","data"), excerpts = rs$text)
#' #newcode = handcode(code = newcode, excerpts = rs$text, n = 4)
#' newcode = test(code = newcode, kappa_threshold = 0.65)
#' summary(newcode$statistics)
#' @return list of Test summary
#' @export
summary.Test = function(object, ...) {
# browser()
args = list(...)
width = 40;
which.test = "test"
if(!is.null(args$which.test)) which.test = args$which.test;
if(!is.null(args$width)) width = args$width
this.summary = c(
which = which.test,
kappa = object$kappa,
N = object$N
)
test_vals = paste0(float(x[[1]]),"\t| ",float(x[[3]]),"\t| ",x[[2]])
if(which.test == "test") {
this.summary = c(this.summary, rho = object$rho)
}
training_vals = paste0(float(x[[4]]),"\t| ",x[[5]])
class(this.summary) = "summary.Test"
this.summary
}
#' Print a Test summary
#'
#' @param x list from summary()
#' @param ... Additional parameters
#'
#' @examples
#' data(RS.data)
#' rs = RS.data
#' newcode = create.code( name = "Data",
#' expressions = c("number","data"), excerpts = rs$text)
#' #newcode = handcode(code = newcode, excerpts = rs$text, n = 4)
#' newcode = test(code = newcode, kappa_threshold = 0.65)
#' summary(newcode$statistics)
#' @return prints summary
#' @export
print.summary.Test = function(x, ...) {
args = list(...)
width = 40;
if(!is.null(args$width)) width = args$width
header = NULL
vals = NULL
if(x[1] == "test") {
header = "kappa\t| rho\t| N"
vals = paste0(float(x[2]),"\t| ",float(x[4]),"\t| ",x[3])
}
else if(x[1] == "training" || x[1] == "secondRater") {
header = "kappa\t| N"
vals = paste0(float(x[2]),"\t| ",x[3])
}
writeLines(c(
"\nTest Set",
paste0(rep("-",width), collapse=""),
"kappa\t| rho\t| N",
paste0(rep("-",width), collapse=""),
test_vals,
paste0(rep("-",width), collapse=""),
header,
"\nTraining Set",
paste0(rep("-",width), collapse=""),
vals,
"kappa\t| N",
paste0(rep("-",width), collapse=""),
training_vals,
paste0(rep("-",width), collapse=""),
""
))
......@@ -208,9 +152,12 @@ print.summary.Test = function(x, ...) {
#' @return List of Code summary
#' @export
summary.Code = function( object, ... ) {
statsSummary = summary(object$statistics)
statsSummary <- NULL
if(length(object$statistics)) {
statsSummary <- summary(object$statistics)
}
this.summary = list(
this.summary <- list(
object$name,
object$codeSet$title,
object$definition,
......@@ -221,7 +168,7 @@ summary.Code = function( object, ... ) {
object$baserateInflation
)
class(this.summary) = "summary.Code"
class(this.summary) <- "summary.Code"
this.summary
}
......
# v0.1.3.1
# v0.2.0.0
Fixed bug in handcode() when there were newlines present
Restructed statistical history on codes
test() defaults kappa_threshold to 0.9
Codes now contain a holdout set and a set of touchable excerpts
Simplified retrieving of handsets for coding, along with randomizing the order
Bugfix: handcode() bug fixed that didn't show full excerpt when there was a newline
Bugfix: Fixed bug when retreiving differences in code sets using differences()
......@@ -6,21 +6,21 @@ rs = RS.data
newcode = create.code(name = "Data", expressions = c("number","data"), excerpts = rs$text)
# Handcode 4 excerpts from RSData
newcode.h = handcode(code = newcode, excerpts = rs$text, n = 4)
newcode = handcode(code = newcode, excerpts = rs$text, n = 4)
# Run test to see rho/kappa of current test set
newcode.t = test(code = newcode.h, kappaThreshold = 0.65)
newcode = test(code = newcode, kappa_threshold = 0.65)
# View the summary, with the calcuated statistics
summary(newcode.t)
summary(newcode)
# View the summary of just the test
summary(newcode.t$statistics)
summary(newcode$statistics)
# Resolve the differences in the test set
newcode.r = resolve(code = newcode.t)
newcode = resolve(code = newcode)
summary(newcode.r)
summary(newcode)
newcode.h2 = handcode(code = newcode.r, excerpts = rs$text, n = 10)
newcode.t2 = test(code = newcode.h2, kappaThreshold = 0.65)
......
\name{NEWS}
\title{News for Package \pkg{ncodeR}}
\section{Changes in nodeR version 0.1.3.1 (2019-06-10)}{
\section{Changes in nodeR version 0.2.0.0 (2019-10-09)}{
\itemize{
\item \code{handcode()} bug fixed that didn't show full excerpt when there was a newline.
\item Restructed statistical history on codes
\item \code{test()} defaults kappa_threshold to 0.9
\item Codes now contain a holdout set and a set of touchable excerpts.
\item Simplified retrieving of handsets for coding, along with randomizing the order
\item Bugfix: \code{handcode()} bug fixed that didn't show full excerpt when there was a newline.
\item Bugfix: Fixed bug when retreiving differences in code sets using \code{differences()}
}
}
......
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/nCoderR.R
% Please edit documentation in R/ncodeR.R
\name{ncodeR}
\alias{ncodeR}
\title{ncodeR for qualitative coding}
......
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/summary&print.R
\name{print.summary.Test}
\alias{print.summary.Test}
\title{Print a Test summary}
\usage{
\method{print}{summary.Test}(x, ...)
}
\arguments{
\item{x}{list from summary()}
\item{...}{Additional parameters}
}
\value{
prints summary
}
\description{
Print a Test summary
}
\examples{
data(RS.data)
rs = RS.data
newcode = create.code( name = "Data",
expressions = c("number","data"), excerpts = rs$text)
#newcode = handcode(code = newcode, excerpts = rs$text, n = 4)
newcode = test(code = newcode, kappa_threshold = 0.65)
summary(newcode$statistics)
}
......@@ -20,9 +20,9 @@ Print a TestList summary
\examples{
data(RS.data)
rs = RS.data
newcode = create.code( name = "Data",
expressions = c("number","data"), excerpts = rs$text)
#newcode = handcode(code = newcode, excerpts = rs$text, n = 4)
newcode <- create.code("Data", expressions = c("number","data"), excerpts = rs$text)
newcode <- handcode(newcode, this.set = 10:15, results = 0)
newcode = test(code = newcode, kappa_threshold = 0.65)
summary(newcode$statistics)
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/summary&print.R
\name{summary.Test}
\alias{summary.Test}
\title{Obtain a summary of a Code's test results}
\usage{
\method{summary}{Test}(object, ...)
}
\arguments{
\item{object}{Test object of Code}
\item{...}{Additional parameters}
}
\value{
list of Test summary
}
\description{
Obtain a summary of a Code's test results
}
\examples{
data(RS.data)
rs = RS.data
newcode = create.code(name = "Data",
expressions = c("number","data"), excerpts = rs$text)
#newcode = handcode(code = newcode, excerpts = rs$text, n = 4)
newcode = test(code = newcode, kappa_threshold = 0.65)
summary(newcode$statistics)
}
......@@ -22,7 +22,7 @@ data(RS.data)
rs = RS.data
newcode = create.code(name = "Data",
expressions = c("number","data"), excerpts = rs$text)
#newcode = handcode(code = newcode, excerpts = rs$text, n = 4)
newcode <- handcode(newcode, this.set = 10:15, results = 0)
newcode = test(code = newcode, kappa_threshold = 0.65)
summary(newcode$statistics)
}
......@@ -4,7 +4,7 @@
\alias{test}
\title{Title}
\usage{
test(code, kappa_threshold = 0.65, baserate_inflation = 0.2)
test(code, kappa_threshold = 0.65, baserate_inflation = 0.2, ...)
}
\arguments{
\item{code}{[TBD]}
......@@ -12,6 +12,8 @@ test(code, kappa_threshold = 0.65, baserate_inflation = 0.2)
\item{kappa_threshold}{[TBD]}
\item{baserate_inflation}{[TBD]}
\item{...}{[TBD]}
}
\value{
code object
......
data(RS.data)
rs_text = RS.data$text
test_that("Verify statistics output", {
testthat::test_that("Verify old parameter", {
name <- "Data"
set <- 10:15
exprs <- c("number","priority")
coded <- sapply(rs_text[set], grepl, pattern = paste0(exprs, collapse = "|")) * 1
newcode <- create.code(name, expressions = exprs, excerpts = rs_text) %>%
handcode(this.set = set, results = 0)
testthat::expect_warning(test(code = newcode, kappaThreshold = 0.9))
})
testthat::test_that("Verify statistics output", {
name <- "Data"
set <- 10:15
exprs <- c("number","priority")
......@@ -11,18 +22,17 @@ test_that("Verify statistics output", {
handcode(this.set = set, results = 0) %>%
test()
expect_equal(length(newcode$statistics), 1)
testthat::expect_equal(length(newcode$statistics), 1)
first_stats <- newcode$statistics[[1]]
expect_equal(first_stats$one_v_classifier$test_set$N, length(set))