Verified Commit aca087f7 authored by Cody L Marquart's avatar Cody L Marquart
Browse files

Merge branch 'develop'

parents 752fb7e0 bd22b541
Pipeline #46368 failed with stages
in 3 minutes and 33 seconds
......@@ -3,4 +3,5 @@
^cran-comments\.md$
demo/app/
demo/backup/
^.gitlab-ci.yml
\ No newline at end of file
^.gitlab-ci.yml
docs/
\ No newline at end of file
.Rproj.user
.Rhistory
.RData
inst/doc
\.DS_Store
inst/coverage/
docs/
stages:
- docker
- check
- test
- docs
- checkWindows
- release
......@@ -8,31 +10,37 @@ variables:
VERSION: "0.1.3.1"
PKG: "ncodeR"
PKGL: "ncoder"
REG_URL: "registry.doit.wisc.edu/epistemic-analytics/qe-packages/ncoder"
REG_URL: "${CI_REGISTRY}/epistemic-analytics/qe-packages/ncoder"
# Templates
.build_definition: &build_template
before_script:
- export _R_CHECK_CRAN_INCOMING_=FALSE
- export _R_CHECK_FORCE_SUGGESTS_=TRUE
- mkdir -p build/
- mkdir -p archive/
- Rscript -e 'update(devtools::dev_package_deps("."))'
- Rscript -e 'install.packages("knitr")'
- Rscript -e 'install.packages("rmarkdown")'
- Rscript -e 'install.packages("testthat")'
- cd ..; R CMD build --resave-data --no-manual --log $PKG || true
.check_definition: &check_template
stage: check
artifacts:
paths:
- build/
- archive/
- doc/
- inst/doc/
expire_in: 1 week
<<: *build_template
script:
- R --no-site-file --no-environ --no-save --no-restore --quiet CMD check ${PKG}_${VERSION}.tar.gz --as-cran --no-manual
- ls -al ./
- ls -al ./$PKG.Rcheck/
- mv ./$PKG-00build.log $PKG/build/ || true
- mv ./$PKG.Rcheck/00check.log $PKG/build/ || true
- mv ./$PKG.Rcheck/00install.out $PKG/build/ || true
- mv ./${PKG}_${VERSION}.tar.gz $PKG/build/
- mv ./$PKG-00build.log $PKG/archive/ || true
- mv ./$PKG.Rcheck/00check.log $PKG/archive/ || true
- mv ./$PKG.Rcheck/00install.out $PKG/archive/ || true
- mv ./${PKG}_${VERSION}.tar.gz $PKG/archive/
tags:
- docker
allow_failure: false
......@@ -62,7 +70,7 @@ check:r-winbuilder:
- curl -v -T ${PKG}_${VERSION}.tar.gz ftp://win-builder.r-project.org/R-oldrelease/${PKG}_${VERSION}.tar.gz
- curl -v -T ${PKG}_${VERSION}.tar.gz ftp://win-builder.r-project.org/R-release/${PKG}_${VERSION}.tar.gz
- curl -v -T ${PKG}_${VERSION}.tar.gz ftp://win-builder.r-project.org/R-devel/${PKG}_${VERSION}.tar.gz
- mv ./${PKG}_${VERSION}.tar.gz $PKG/build/
- mv ./${PKG}_${VERSION}.tar.gz $PKG/archive/
tags:
- docker
allow_failure: false
......@@ -83,33 +91,85 @@ release:cran:
tags:
- docker
allow_failure: false
test:r-release:
stage: test
image: $REG_URL/release-test:current
script:
- 'Rscript -e "Rcpp::compileAttributes()"'
- 'Rscript -e "rep <- covr::report( x = covr::package_coverage(quiet = T), file = \"inst/coverage/${CI_COMMIT_REF_NAME}/index.html\" )"'
- 'Rscript -e "setwd(\"inst/coverage/${CI_COMMIT_REF_NAME}\"); res = lapply(dir(\".\", full.names = F, recursive = T), function(f) { aws.s3::put_object(file = f, object = paste0(\"coverage/${CI_COMMIT_REF_NAME}/\", f), bucket = \"${PKGL}.qe-libs.org\", acl = \"public-read\") })"'
- 'Rscript --vanilla demo/backup/coverage-report.R ${CI_COMMIT_REF_NAME}'
dependencies:
- check:r-release
docs:build-site:
stage: docs
image: $REG_URL/release-test:current
script:
- rm -rf src/RcppExports*
- rm -rf R/RcppExports*
- 'Rscript -e "Rcpp::compileAttributes()"'
- 'Rscript -e "install.packages(\"fs\")"'
- 'Rscript -e "install.packages(\"pkgdown\")"'
- 'Rscript -e "devtools::install(\".\", quick = T, build = T, dependencies = F, upgrade = F)"'
- 'Rscript -e "pkgdown::build_site(preview = F, document = F, lazy = T, new_process = T)"'
- 'Rscript -e "setwd(\"docs\"); res = lapply(dir(\".\", full.names = F, recursive = T), function(f) { aws.s3::put_object(file = f, object = f, bucket = \"${PKGL}.qe-libs.org\", acl = \"public-read\") })"'
dependencies:
- test:r-release
.docker_definition: &docker_template
stage: docker
tags:
- shell
image: docker:19.03.1
services:
- docker:19.03.1-dind
before_script:
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
script:
- docker build -t $REG_URL/$TAG:current -f inst/docker/$PKG-$FILE inst/docker/
- docker push $REG_URL/$TAG:current
only:
changes:
- ".gitlab-ci.yml"
- "inst/docker/$PKG-$FILE"
docker:release:
variables:
TAG: "release"
FILE: "release"
DOCKER_HOST: tcp://localhost:2375/
DOCKER_DRIVER: overlay2
DOCKER_TLS_CERTDIR: ""
only:
changes:
- "inst/docker/$PKG-$FILE"
<<: *docker_template
docker:release-test:
variables:
TAG: "release-test"
FILE: "release-test"
DOCKER_HOST: tcp://localhost:2375/
DOCKER_DRIVER: overlay2
DOCKER_TLS_CERTDIR: ""
only:
changes:
#- ".gitlab-ci.yml"
- "inst/docker/$PKG-$FILE"
<<: *docker_template
docker:old-release:
variables:
TAG: "oldrelease"
FILE: "oldrelease"
DOCKER_HOST: tcp://localhost:2375/
DOCKER_DRIVER: overlay2
DOCKER_TLS_CERTDIR: ""
only:
changes:
- "inst/docker/$PKG-$FILE"
<<: *docker_template
docker:devel:
variables:
TAG: "devel"
FILE: "devel"
DOCKER_HOST: tcp://localhost:2375/
DOCKER_DRIVER: overlay2
DOCKER_TLS_CERTDIR: ""
only:
changes:
- "inst/docker/$PKG-$FILE"
<<: *docker_template
\ No newline at end of file
......@@ -2,11 +2,12 @@ Package: ncodeR
Title: Techniques for Automated Classifiers
Type: Package
Author: Cody L Marquart [aut, cre], Zachari Swiecki [aut], Brendan Eagan [aut], David Williamson Shaffer [aut]
Authors@R: c(person("Cody L","Marquart", role = c("aut", "cre"), email="cody.marquart@wisc.edu"),person("Zachari","Swiecki", role = c("aut"), email="swiecki@wisc.edu"),person("Brendan","Eagan", role = c("aut"), email="beagan@wisc.edu"),person("David", "Williamson Shaffer", role = c("aut"), email = "dws@education.wisc.edu"))
Authors@R: c(person("Cody L","Marquart", role = c("aut", "cre"), email="cody.marquart@wisc.edu", comment = c(ORCID = "0000-0002-3387-6792")),person("Zachari","Swiecki", role = c("aut"), email="swiecki@wisc.edu"),person("Brendan","Eagan", role = c("aut"), email="beagan@wisc.edu"),person("David", "Williamson Shaffer", role = c("aut"), email = "dws@education.wisc.edu"))
Maintainer: Cody L Marquart <cody.marquart@wisc.edu>
Version: 0.1.3.1
Description: A set of techniques that can be used to develop, validate, and implement automated classifiers. A powerful tool for transforming raw data into meaningful information, 'ncodeR' (Shaffer, D. W. (2017) Quantitative Ethnography. ISBN: 0578191687) is designed specifically for working with big data: large document collections, logfiles, and other text data.
LazyData: TRUE
BugReports: https://gitlab.com/epistemic-analytics/qe-packages/ncoder/issues
Depends:
R (>= 3.0.0)
License: GPL-3 | file LICENSE
......@@ -15,5 +16,8 @@ Imports:
rhoR,
cli
Suggests:
testthat
RoxygenNote: 6.1.0
testthat,
knitr,
rmarkdown
RoxygenNote: 6.1.1
VignetteBuilder: knitr
......@@ -16,9 +16,12 @@ export(autocode)
export(code.set)
export(create.code)
export(differences)
export(expression.match)
export(getHandSetIndices)
export(getHandSetIndices2)
export(handcode)
export(ncode)
export(old_test)
export(resolve)
export(test)
import(R6)
......
......@@ -46,6 +46,9 @@ Code = R6::R6Class("Code",
definition = NULL,
codeSet = NULL,
excerpts = NULL,
holdoutExcerpts = NULL,
touchableExcerpts = NULL,
touchedIndices = NULL,
testSet = data.frame(), #matrix, columns: ID, R1training, R1test (with potential to add columns for other raters)
trainingSet = data.frame(),
......@@ -53,7 +56,7 @@ Code = R6::R6Class("Code",
ignoredSet = data.frame(),
secondRaterSet = data.frame(),
examples = NULL,
statistics = list(testSet = list(), trainingSet = list(), secondRaterSet = list()),
statistics = list(),
baserateInflation = NA,
baserate = NA,
......@@ -70,24 +73,34 @@ Code = R6::R6Class("Code",
ignoredSet = c(),
examples = NULL,
excerpts = NULL,
holdoutSize = 0.9,
...
){
codeSet = NULL;
if(class(name) != "character"){
stop("name must be a string");
}
# if(class(definition) != "character"){
# stop("Conceptual definition must be a string");
# }
if(!is.null(excerpts)) {
self$excerpts = excerpts;
# codeSet = CodeSet$new(title = "NewCodeSet", description = "New CodeSet for Codes", codes = c(self))
} else if(!is.null(codeSet)) {
}
else if(!is.null(codeSet)) {
# codeSet$codes = c(codeSet$codes, self)
if(!is.null(codeSet$excerpts)) {
self$excerpts = codeSet$excerpts
}
}
}
# browser()
# self$touchableExcerpts = sample(seq(along=excerpts), size = ifelse(length(excerpts)>=holdoutSize, holdoutSize, length(excerpts)))
# self$holdoutExcerpts = seq_along(excerpts)[!(seq_along(excerpts) %in% self$touchableExcerpts)]
holdoutN = ceiling(length(excerpts) * holdoutSize)
self$holdoutExcerpts = sample(
seq(along=excerpts),
size = holdoutN
)
self$touchableExcerpts = seq_along(excerpts)[!(seq_along(excerpts) %in% self$holdoutExcerpts)]
# length(excerpts) - length(self$holdoutExcerpts)
args = list(...);
......@@ -111,7 +124,9 @@ Code = R6::R6Class("Code",
colnames(self$secondRaterSet) = c("ID", "X1");
}
if(is.null(computerSet)) {
self$computerSet = rep(NA, length(codeSet$excerpts));
# self$computerSet = rep(NA, length(codeSet$excerpts));
self$computerSet = matrix(ncol = 2, nrow = 0);
colnames(self$computerSet) = c("ID", "X1");
}
self$ignoredSet = ignoredSet;
......
......@@ -29,69 +29,89 @@ RegexCode = R6::R6Class("RegexCode",
###
# Main class constructor
###
initialize = function(
name,
definition,
...,
excerpts = NULL,
expressions = NULL
){
super$initialize(name = name, definition = definition, excerpts = excerpts, ...)
if((!is.null(expressions)) && (class(expressions) != "character")){
stop("expressions must be a vector of strings");
}
self$expressions = expressions;
},
######
initialize = function(
name,
definition,
...,
excerpts = NULL,
expressions = NULL
){
super$initialize(name = name, definition = definition, excerpts = excerpts, ...)
if((!is.null(expressions)) && (class(expressions) != "character")){
stop("expressions must be a vector of strings");
}
self$expressions = expressions;
},
#####
process = function(excerpts = self$excerpts) {
expression.match(excerpts, self$expressions, names = list(NULL, self$name))
},
###
# Process function to override that on Code
#####
process = function(excerpts = self$excerpts) {
expression.match(excerpts, self$expressions, names = list(NULL, self$name))
},
#####
###
# Adds a new expression to the code's list
###
add = function(
word
){
if(class(word) != "character"){
stop("word must be a string");
}
self$expressions = c(self$expressions, word);
###DOES THIS AFFECT TRAINING/TEST SETS?
},
#####
add = function(
word
){
if(class(word) != "character"){
stop("word must be a string");
}
self$expressions = c(self$expressions, word);
###DOES THIS AFFECT TRAINING/TEST SETS?
},
#####
###
# Removes an expression from the code's list
#####
remove = function(
word
){
if(class(word) != "character"){
stop("word must be a string");
}
index = which(self$expressions == word)
if(length(index) == 0){
stop(paste("\"", word, "\" does not exist in the expressions list", sep = ""));
}else{
self$expressions = self$expressions[-index];
}
###DOES THIS AFFECT TRAINING/TEST SETS?
},
#####
###
remove = function(
word
){
if(class(word) != "character"){
stop("word must be a string");
}
index = which(self$expressions == word)
if(length(index) == 0){
stop(paste("\"", word, "\" does not exist in the expressions list", sep = ""));
}else{
self$expressions = self$expressions[-index];
# Concatenate expressions list as single regular expression
#####
concat = function(){
return (paste(self$expressions, collapse="|"));
}
###DOES THIS AFFECT TRAINING/TEST SETS?
},
concat = function(){
return (paste(self$expressions, collapse="|"));
}
#####
),
private = list()
)
#' Expression Matching
#'
#' @description Match a set of text excerpts against a set of regular expressions
#' @param excerpts Character vector to match against
#' @param expressions Character vector of expressions
#' @param names Character vector to use for dimension names
#'
#' @return Matrix representing matched expressions
#' @export
expression.match <- function(excerpts, expressions, names = list(NULL, "V1")) {
matrix(
unlist(lapply(excerpts, function(x) {
......
calc_statistics <- function(set, kappa_threshold = 0.65, baserate_inflation = 0.2) {
stats = list( rho = NA, kappa = NA )
if( nrow(set) > 0 ) {
stats <- tryCatch(
rhoR::rhoSet(
set[, -1],
ScSKappaThreshold = kappa_threshold,
testSetBaserateInflation = baserate_inflation
),
error = function(x) {
warning("Unable to calculate valid rho, returning NA with kappa")
list(rho = NA, kappa = rhoR::kappaSet(set[, -1]))
}
)
}
stats$N <- nrow(set)
stats$set <- set
stats
}
#' Title
#'
#' @param code [TBD]
#' @param kappa_threshold [TBD]
#' @param baserate_inflation [TBD]
#'
#' @return code object
#' @export
test <- function(code, kappa_threshold = 0.65, baserate_inflation = 0.2) {
code.to.use <- code$clone(deep=T);
code.to.use$baserateInflation <- baserate_inflation;
first_v_classifier_test <- NULL
first_v_classifier_train <- NULL
first_v_second <- NULL
second_v_classifier <- NULL
if(nrow(code$testSet) > 0) {
to.test <- merge(code.to.use$computerSet, code.to.use$testSet, by = 1)
if(!any(is.na(to.test[, 2:3])))
first_v_classifier_test = calc_statistics(to.test)
}
if(nrow(code.to.use$trainingSet) > 0) {
to.test = merge(code.to.use$computerSet, code.to.use$trainingSet, by = 1)
to.test[is.na(to.test[,3]), 3] = abs(to.test[is.na(to.test[,3]), 2] - 1)
to.test[!to.test[,3] %in% 0:1, 3] <- abs(to.test[!to.test[,3] %in% 0:1, 2] - 1)
first_v_classifier_train = calc_statistics(to.test)
}
first_v_classifier <- test_result(
test_set = first_v_classifier_test,
training_set = first_v_classifier_train
)
if(nrow(code.to.use$secondRaterSet) > 0) {
first_set <- rbind(code.to.use$testSet, code.to.use$trainingSet)
to.test <- merge(first_set, code.to.use$secondRaterSet, by = 1, all = TRUE)
to.test <- to.test[rowSums(!is.na(to.test[,2:3]) * 1) > 1, ]
first_v_second <- calc_statistics(to.test)
to.test <- merge(code.to.use$computerSet, code.to.use$secondRaterSet, by = 1, all = TRUE)
to.test <- to.test[rowSums(!is.na(to.test[,2:3]) * 1) > 1, ]
second_v_classifier <- calc_statistics(to.test)
}
new_tests <- list(
one_v_classifier = first_v_classifier,
one_v_two = test_result(test_set = first_v_second),
two_v_classifier = test_result(test_set = second_v_classifier)
)
code.to.use$setValue("testedTestSet", T)
code.to.use$statistics[[length(code.to.use$statistics) + 1]] <- new_tests
code.to.use
}
#' @title Calculate statistics
#' @description Run tests (kappa, rho) on the given Code
#'
......@@ -8,7 +87,7 @@
#'
#' @return Code object with updated statistics property
#' @export
test <- function(code, kappaThreshold = 0.65, baserateInflation = 0.2, type = c("training", "test")) {
old_test <- function(code, kappaThreshold = 0.65, baserateInflation = 0.2, type = c("training", "test")) {
if(is.null(code) || !is(object=code, class=c("Code")))
stop("Supplied `code` must be an instance of `Code`")
......@@ -33,7 +112,7 @@ test <- function(code, kappaThreshold = 0.65, baserateInflation = 0.2, type = c(
statResults = list();
if("training" %in% type) {
to.train = getSet("training");
to.train = merge.sets(code.to.use, "trainingSet")[,-c(1)]
trainKappa = list(kappa=NA)
trainN = nrow(to.train)
if(trainN>0) {
......@@ -52,10 +131,12 @@ test <- function(code, kappaThreshold = 0.65, baserateInflation = 0.2, type = c(
kappa = kappaThreshold
)
)
code.to.use$statistics$trainingSet[[length(code.to.use$statistics$trainingSet)+1]] = results
# code.to.use$statistics$trainingSet[[length(code.to.use$statistics$trainingSet)+1]] = results
code.to.use$statistics$trainingSet = c(results, code.to.use$statistics$trainingSet)
}
if("test" %in% type) {
to.test = getSet("test");
# to.test = getSet("test");
to.test = merge.sets(code.to.use, "testSet")[,-c(1)]
testRho = list(rho=NA, kappa=NA)
testN = nrow(to.test)
if(testN>0) {
......@@ -76,7 +157,8 @@ test <- function(code, kappaThreshold = 0.65, baserateInflation = 0.2, type = c(
kappa = kappaThreshold
)
)
code.to.use$statistics$testSet[[length(code.to.use$statistics$testSet)+1]] = results
# code.to.use$statistics$testSet[[length(code.to.use$statistics$testSet)+1]] = results
code.to.use$statistics$testSet = c(results, code.to.use$statistics$testSet)
code.to.use$setValue("testedTestSet", T)
}
if("second" %in% type) {
......@@ -96,7 +178,8 @@ test <- function(code, kappaThreshold = 0.65, baserateInflation = 0.2, type = c(
kappa = kappaThreshold
)
)
code.to.use$statistics$secondRaterSet[[length(code.to.use$statistics$secondRaterSet)+1]] = results
# code.to.use$statistics$secondRaterSet[[length(code.to.use$statistics$secondRaterSet)+1]] = results;
code.to.use$statistics$secondRaterSet = c(results, code.to.use$statistics$secondRaterSet);
}
# testResult = Test$new(list(
......@@ -114,47 +197,60 @@ test <- function(code, kappaThreshold = 0.65, baserateInflation = 0.2, type = c(
code.to.use
}
Test = R6::R6Class("Test",
public = list(
baserate = NULL,
baserateInflation = NULL,
# testSet = list(),
# trainingSet = list(),
# secondRaterSet = list(),
thresholds = list(),
set = list(),
code = NULL,
rho = NULL,
kappa = NULL,
N = NULL,
excerpts = NULL,
###
# Main class constructor
###
initialize = function(stats,baserate,inflation,thresholds,code = NULL) {
# if(!all(type %in% c("training", "test", "second"))) {
# warning("Type must be one or more of: c('training', 'test', 'second').")
# } else {
self$thresholds = thresholds; #stats$thresholds
self$baserate = baserate;
self$baserateInflation = inflation;
lapply(names(stats), function(n) { self[[n]] = stats[[n]] })
# self$trainingSet = stats$results$trainingSet
# self$testSet = stats$results$testSet
# self$secondRaterSet = stats$results$secondRaterSet
if(!is.null(code)) {
self$code = code$clone(deep=T);
}
# }
},
print = function() {
print(sapply(ls(self)[sapply(ls(self), function(x) !is.function(self[[x]]))], function(x) self[[x]]))
}
),
private = list(
#####
# Test Class
#####
Test = R6::R6Class("Test",
public = list(
baserate = NULL,
baserateInflation = NULL,
# testSet = list(),
# trainingSet = list(),
# secondRaterSet = list(),
thresholds = list(),
set = list(),