Verified Commit 57679904 authored by Cody L Marquart's avatar Cody L Marquart
Browse files

unseen words

#13
parent 6b3761f5
Pipeline #53287 failed with stages
in 3 minutes and 22 seconds
......@@ -70,8 +70,9 @@ check:r-release:
except:
refs:
- tags
only:
variables:
- $CI_COMMIT_MESSAGE =~ /.*\[skip winbuilder\].*/
- $CI_COMMIT_MESSAGE =~ /.*\[check winbuilder\].*/
check:r-winbuilder:
stage: check
image: $REG_URL/release-test:current
......@@ -90,8 +91,9 @@ check:r-winbuilder:
except:
refs:
- tags
only:
variables:
- $CI_COMMIT_MESSAGE =~ /.*\[skip rhub\].*/
- $CI_COMMIT_MESSAGE =~ /.*\[check rhub\].*/
.rhub_definition: &rhub_definition
stage: check
image: $REG_URL/release:current
......
......@@ -20,4 +20,4 @@ Suggests:
magrittr,
knitr,
rmarkdown
RoxygenNote: 6.1.1
RoxygenNote: 7.0.2
......@@ -22,6 +22,7 @@ export(ncode)
export(old_test)
export(resolve)
export(test)
export(unseen_words)
import(R6)
import(cli)
import(rhoR)
No preview for this file type
......@@ -49,7 +49,8 @@ Code = R6::R6Class("Code",
holdoutExcerpts = NULL,
touchableExcerpts = NULL,
touchedIndices = NULL,
additionalExcerpts = NULL,
testSet = data.frame(), #matrix, columns: ID, R1training, R1test (with potential to add columns for other raters)
trainingSet = data.frame(),
computerSet = data.frame(),
......
......@@ -46,15 +46,20 @@ test <- function(code, kappa_threshold = 0.65, baserate_inflation = 0.2, ...) {
second_v_classifier <- NULL
first_v_second_train <- NULL
second_v_classifier_train <- NULL
# browser()
if(nrow(code$testSet) > 0) {
to.test <- merge(code.to.use$computerSet, code.to.use$testSet, by = 1)
to.test <- to.test[!to.test$ID %in% code.to.use$additionalExcerpts,]
if(!any(is.na(to.test[, 2:3])))
first_v_classifier_test = calc_statistics(to.test)
}
if(nrow(code.to.use$trainingSet) > 0) {
if(nrow(code.to.use$trainingSet) > 0 || length(code.to.use$additionalExcerpts)) {
# browser()
to.test = merge(code.to.use$computerSet, code.to.use$trainingSet, by = 1)
# to.test <- rbind(to.test, code.to.use$testSet[!to.test$ID %in% code.to.use$additionalExcerpts,]
to.test[is.na(to.test[,3]), 3] = abs(to.test[is.na(to.test[,3]), 2] - 1)
to.test[!to.test[,3] %in% 0:1, 3] <- abs(to.test[!to.test[,3] %in% 0:1, 2] - 1)
first_v_classifier_train = calc_statistics(to.test)
......
......@@ -46,7 +46,7 @@ getHandSetIndices2 = function(
code,
handSetLength = 20,
handSetBaserate = .2,
unseen = F,
unseen = FALSE,
this.set = NULL
) {
codeToUse = code$clone(deep = T)
......@@ -104,21 +104,24 @@ getHandSetIndices2 = function(
}
}
if(unseen == T) {
warning("Unseen excerpts is not yet implemented.")
}
additional_excerpts = NULL;
# if(unseen == TRUE) {
# message("Unseen excerpts is in beta and still being tested")
# codeToUse$additionalExcerpts <- unseen_words(codeToUse);
# # browser()
# codeToUse$computerSet = rbind(
# codeToUse$computerSet,
# cbind(codeToUse$additionalExcerpts, codeToUse$process(excerpts = codeToUse$excerpts[codeToUse$additionalExcerpts]))
# );
# # this.set = c(this.set, codeToUse$additionalExcerpts);
# }
# randomize output order of indices
# codeToUse$testSet = rbind(
# codeToUse$testSet,
# #data.frame(ID=this.set[sample.int(n = handSetLength)], X1=NA)
# data.frame(ID=codeToUse$computerSet[,1][sample.int(n = handSetLength)], X1=NA)
# )
codeToUse$testSet = rbind(
codeToUse$testSet,
matrix(
c(this.set[sample.int(n = handSetLength)], rep(NA, handSetLength)),
ncol = 2, nrow = handSetLength,
c(this.set[sample.int(n = length(this.set))], rep(NA, length(this.set))),
ncol = 2, nrow = length(this.set),
byrow = F, dimnames = list(NULL, colnames(codeToUse$testSet))
)
)
......@@ -133,6 +136,15 @@ getHandSetIndices2 = function(
data.frame(ID=this.set, X1=NA)
)
}
if(unseen == TRUE) {
message("Unseen excerpts is in beta and still being tested")
codeToUse$additionalExcerpts <- unseen_words(codeToUse);
newRows <- data.frame(cbind(ID = codeToUse$additionalExcerpts, codeToUse$process(excerpts = codeToUse$excerpts[codeToUse$additionalExcerpts])))
colnames(newRows) <- colnames(codeToUse$computerSet)
codeToUse$computerSet = rbind(codeToUse$computerSet, newRows);
# this.set = c(this.set, codeToUse$additionalExcerpts);
}
return(codeToUse);
}
\ No newline at end of file
......@@ -20,7 +20,7 @@ handcode = function(
expressions = NULL,
n = ifelse(is.null(this.set), 10, length(this.set)),
baserate = 0.2,
unseen = F,
unseen = FALSE,
this.set = NULL,
results = NULL
) {
......@@ -71,7 +71,7 @@ handcode = function(
#code.to.use = autocode(x = code.to.use, expressions = code.to.use$expressions, excerpts = code.to.use$excerpts, simplify=F)
# code.to.use = getHandSetIndices2(code.to.use, handSetLength = n, handSetBaserate = baserate, unseen = unseen)
code.to.use = getHandSetIndices2(
code = code.to.use,
handSetLength = n,
......@@ -79,24 +79,14 @@ handcode = function(
unseen = unseen,
this.set = this.set
)
indices = code.to.use$testSet[,1]
## New handset code
# indices = getHandSetIndices2(code.to.use, handSetLength = n, handSetBaserate = baserate, unseen = unseen)
# code.to.use$computerSet = code.to.use$process(code.to.use$excerpts[indices])
indices = c(code.to.use$testSet[is.na(code.to.use$testSet[,2]), 1], code.to.use$additionalExcerpts);
indices = sample(indices, size = length(indices))
# selfCodes = c()
# for(i in 1:length(excerpts)) {
# index = i;
# print(boxx(c(strwrap(as.character(excerpts[index]))), width=50))
# uin = readline("Enter (y/n): ");
# selfCodes = c(selfCodes, grepl(x=tolower(uin),pattern="^y",perl=T)*1)
# }
len = length(indices)
coding = TRUE;
recoding = FALSE;
numberToReview = -1;
if(is.null(results)) {
while(coding) {
selfCodes = matrix(c(indices, sapply(indices, function(index) {
......@@ -108,7 +98,7 @@ handcode = function(
print(cli::boxx(c(
paste0("ID: ", index),
paste0("Excerpt #",indexCount),
paste0("Excerpt #",indexCount, " of ", length(indices)),
paste0("Definition: ", code$definition),
"",
strwrap(as.character(sub(pattern = "\n", replacement = " ", perl = T, x = excerpts[index])))
......@@ -143,22 +133,29 @@ handcode = function(
}
}
else {
selfCodes = matrix(c(
this.set,
rep(results, length(this.set) / length(results))
), ncol = 2)
if(length(results) == length(c(this.set, code.to.use$additionalExcerpts))) {
selfCodes = matrix(c(this.set, code.to.use$additionalExcerpts,results), ncol = 2, dimnames = list(NULL, c("ID", code.to.use$name)))
}
else {
selfCodes = matrix(c(
this.set,
rep(results, length(this.set) / length(results))
), ncol = 2, dimnames = list(NULL, c("ID", code.to.use$name)))
}
}
# browser()
selfCoded_in_test <- which(!selfCodes[, 1] %in% code.to.use$additionalExcerpts);
selfCoded_in_test_IDs <- selfCodes[selfCoded_in_test, 1];
selfCoded_in_addl <- which(selfCodes[, 1] %in% code.to.use$additionalExcerpts);
# code.to.use$testSet = rbind(code.to.use$testSet, selfCodes)
code.to.use$testSet[code.to.use$testSet[,1] == selfCodes[,1] ,2] = selfCodes[, 2]
# code.to.use$testSet[code.to.use$testSet[,1] %in% selfCodes[,1], 2] = selfCodes[selfCoded_in_test, 2]
sapply(selfCoded_in_test_IDs, function(id) {
code.to.use$testSet[code.to.use$testSet[,1] == id, 2] = selfCodes[selfCodes[,1] == id, 2]
})
# if(!is.null(expressions)) {
# autoCodes = code(expressions, excerpts)
# retMat = matrix(c(selfCodes, autoCodes), ncol= 2, dimnames=list(NULL,c("self","computer")))
# return(retMat);
# } else {
# return(selfCodes);
# }
if(length(selfCoded_in_addl) > 0) {
code.to.use$trainingSet = rbind(code.to.use$trainingSet, selfCodes[selfCoded_in_addl,])
}
code.to.use
}
\ No newline at end of file
create_tm = function(text){
text <- as.character(text);
corpus <- tm::VCorpus(tm::VectorSource(text));
terms <- tm::TermDocumentMatrix(
corpus,
control = list(wordLengths=c(1, Inf),
removePunctuation = TRUE,
removeNumbers = TRUE)
);
return(terms);
}
#' Unseen Words
#'
#' @param code Code object
#'
#' @return Excerpts that contain unseen words
#'
#' @export
unseen_words <- function(code) {
text <- code$excerpts
words <- code$expressions
coded <- code$testSet[, 1];
hcy.index <- code$testSet[code$testSet[, 2] == 1, 1]
hcn.index <- code$testSet[code$testSet[, 2] == 0, 1]
hcn.excerpts <- text[hcn.index];
touchedIndices.c <- code$touchedIndices;
touchableIndices.c <- code$touchableExcerpts;
prioritized.indices <- c(hcy.index, touchedIndices.c, touchableIndices.c);
prioritized.excerpts <- text[prioritized.indices];
#create a prioritized vocab matrix
prioritized_tm <- create_tm(prioritized.excerpts);
prioritized.df <- as.data.frame(as.matrix(prioritized_tm));
#create a word document matrix for handcode no exerpts
hcn.terms <- create_tm(hcn.excerpts);
filter_words <- c(hcn.terms$dimnames$Terms, words);
unseen_set <- prioritized.df[!row.names(prioritized.df) %in% filter_words,];
ordered_summed <- order(colSums(unseen_set), decreasing = TRUE);
head(ordered_summed[!ordered_summed %in% coded], 2);
}
\ No newline at end of file
# v0.2.0.1
# v0.2.1.0
Features:
* Implemented the `unseen` paramater in `handcode()`. Adds an additional
two excerpts to test set that contain the highest occurrence of words
unseen by the coder
Bug fixes:
* Missing precision and recall calculation with rho/Kappa
* Fixed output from autocode()
\ No newline at end of file
*
\ No newline at end of file
This diff is collapsed.
# This is a generalized R script for using ncodeR
# Version 2 (2018-08-13 Updated)
## Changelog:
## 1. Included function diffResolve() to resolve coding difference between any two parties
## 2. Added lines for exporting R object "newcode.t" (useful for autocoding all data at the end)
## 3. Clarified some comments and instructions
## 4. Fixed an issue where require(ncodeR) package may fail
################################################################################
# Look for this symbol *** for parts that need to be changed for your data set #
################################################################################
########################################################################################################
# Remeber to set your working directory to where the ncodeR package "ncodeR_0.1.0.tar.gz" is stored at #
# (Session -> Set Working Directory -> Choose Directory...) #
# And put your data file in the same working directory #
########################################################################################################
rm(list = ls())
gc()
# Set environment ----
if (!require(rhoR)) {
install.packages("rhoR")
require(rhoR)
}
if (!require(ncodeR)) {
install.packages("ncodeR")
require(ncodeR)
}
if (!require(dplyr)) {
install.packages("dplyr")
require(dplyr)
}
if (!require(openxlsx)) {
install.packages("openxlsx")
require(openxlsx)
}
if (!require(R6)) {
install.packages("R6")
require(R6)
}
if (!require(cli)) {
install.packages("cli")
require(cli)
}
if (!require(tm)) {
install.packages("tm")
require(tm)
}
# --------------------------------------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------------------------------
#gethandsetindices2
getHandSetIndices2 = function(codeToUse, handSetLength = 20, handSetBaserate = .2, unseen = F) {
positives = ceiling(handSetLength * handSetBaserate); # minimum number of positive indices needed
maxNonPositives = handSetLength - positives; # maximum number of non-positive indices allowed
this.set = NULL; # set of indices to be returned
# create data frame same length as excerpts -- used for keeping track of indices sampled
indices = c(1:length(codeToUse$excerpts))
# set touchedIndices, testSet, and trainingSet to NA - ensure they won't be sampled again
indices[unlist(codeToUse$touchedIndices, recursive = TRUE, use.names = TRUE)] = NA
indices[unlist(codeToUse$testSet, recursive = TRUE, use.names = TRUE)] = NA
indices[unlist(codeToUse$trainingSet, recursive = TRUE, use.names = TRUE)] = NA
# get miniumum number of positive indices necessary
while(positives > 0){
if(sum(is.na(indices)) == length(codeToUse$excerpts) ){ # not enough positives to fill baserate
stop("Not enough positives in first rater to inflate to this level")
}
randIndice = sample(indices[!is.na(indices)],1) # get a random indice
indices[randIndice] = NA # set this indice to be NA, won't be sampled again
# autocode excerpt at this single indice
if(codeToUse$process(codeToUse$excerpts[randIndice]) == 1){
positives = positives - 1;
this.set = c(this.set, randIndice) # only adding positive indices to handset
}
else{ # deal with nonPositive excerpt case
if(maxNonPositives != 0){ # room in handSet to add nonPositive -> add it
this.set = c(this.set, randIndice)
maxNonPositives = maxNonPositives - 1;
}
else{ # if max number of nonPositve indices is reached, save indice, but don't add to set
codeToUse$touchedIndices = c(codeToUse$touchedIndices, randIndice)
}
}
}
# fill up the rest of handset with random indices, positive or not
if(maxNonPositives!= 0){
for(i in 1:maxNonPositives){
randIndice = sample(indices[!is.na(indices)],1)
indices[randIndice] = NA
this.set = c(this.set, randIndice)
}
}
if(unseen == T) {
print("Unseen excerpts is not yet implemented.")
}
return(sample(this.set)); # randomize output order of indices
}
# --------------------------------------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------------------------------
# Import data for validation ----
data = read.csv(file = "demo/backup/primary_debates_cleaned_separated2.csv", # *** Replace fileNameGoesHere with your data's file name; data has to be saved in .csv format
stringsAsFactors = FALSE,
na.strings = "NA")
text = data[1:1000,]$Text # *** Replace columnNameGoesHere with the name of column that stores all data which needs to be coded
################################################################################
# building a termDocument Matrix #
################################################################################
createVocab = function(text){
RS_corpus = VCorpus(VectorSource(text))
terms <-TermDocumentMatrix(RS_corpus, control = list(wordLengths=c(1, Inf),
removePunctuation = TRUE,removeNumbers = TRUE))
vocab_matrix = as.matrix(terms)
vocab_df = as.data.frame(vocab_matrix)
vocab_df = cbind(Seen = FALSE, vocab_df)
return(vocab_df)
}
# -----------------------------------------------
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #
# Start coding (& Restart point) ----
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #
# Define the Code (big C) that we are testing
CodeName = "terrorism" # *** replace CodeNameGoesHere with your big C Code's name; strongly recommend using camelCase if your Code contains more than one word
CodeDefinition = "terrorism" # *** replace defintionGoesHere with your Code's definition
# Specify all codes (small c), i.e. your word list
## Notice that your word list needs to be in R's regular expression format,
## if you have question about regular expressions, below are some websites that you can check out
## Regular expression introduction & cheatsheets:
## https://www.rexegg.com/regex-quickstart.html
## http://www.cbs.dtu.dk/courses/27610/regular-expressions-cheat-sheet-v2.pdf
## Regular expression testing:
## https://regex101.com/
words = c(
"terror",
"Iran",
"ISIS",
"chaos",
"extremist",
"Laden",
"guerrilla",
"anarch",
"militant",
"enemy",
"extremism",
"jihad"
#"gaddafi"
)
newcode = create.code(
name = CodeName,
definition = CodeDefinition,
expressions = words,
excerpts = text
)
# -----------------------------------------------
# Human Rater 1 ----
# STEP 1: Handcode the Code (big C) we just defined
newcode.h <- handcode(code = newcode, excerpts = text, n = 10) # *** if you want a different sample size, you can change n = your desired sample size
hcy.index <- newcode.h$testSet[,1][newcode.h$testSet[,2]==1] #handcode yes index
hcn.index <- newcode.h$testSet[,1][newcode.h$testSet[,2]==0] #handcode no index
hcn.excerpts <- text[hcn.index] #handcode no excerpts
# STEP 2: Computer code random excerpts !!may be the same as handset
touchedIndices.c <- getHandSetIndices2(newcode)$touchedIndices #computer touched indices
touchableIndices.c <- getHandSetIndices2(newcode)$touchableExcerpts
prioritized.indices <- c(hcy.index, touchedIndices.c, touchableIndices.c)
prioritized.excerpts <- text[prioritized.indices]
#create a prioritized vocab matrix
prioritized.df <- createVocab(prioritized.excerpts)
#create a word document matrix for handcode no exerpts
hcn.terms <- tm::TermDocumentMatrix(tm::VCorpus(tm::VectorSource(hcn.excerpts)),
control = list(wordLengths=c(1, Inf),
removePunctuation = TRUE,removeNumbers = TRUE))
filter_words <- c(row.names(as.matrix(hcn.terms)), words)
prioritized.df$Seen = row.names(prioritized.df) %in% filter_words
unseen_set = prioritized.df[prioritized.df$Seen==FALSE,]
unseen_set = unseen_set[,-c(1)]
prioritized.excerpts[as.numeric(names(sort(colSums(unseen_set), decreasing = TRUE)))[1:2]]
......@@ -6,7 +6,7 @@ rs = RS.data
newcode = create.code(name = "Data", expressions = c("number","data"), excerpts = rs$text)
# Handcode 4 excerpts from RSData
newcode = handcode(code = newcode, excerpts = rs$text, n = 4)
newcode = handcode(code = newcode, excerpts = rs$text, n = 4, unseen = TRUE)
# Run test to see rho/kappa of current test set
newcode = test(code = newcode, kappa_threshold = 0.65)
......
\name{NEWS}
\title{News for Package \pkg{ncodeR}}
\section{Changes in nodeR version 0.2.1.0 (TBD)}{
\itemize{
\item Feature: Implemented the `unseen` paramater in `handcode()`. Adds an additionaltwo excerpts to test set that contain the highest occurrence of words unseen by the coder
}
}
\section{Changes in nodeR version 0.2.0.1 (2019-11-19)}{
\itemize{
\item Bugfix: Missing precision and recall calculation with rho/Kappa
......
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/CodeSet.R
\docType{data}
\name{CodeSet}
\alias{CodeSet}
\title{CodeSet}
\format{An object of class \code{R6ClassGenerator} of length 24.}
\usage{
CodeSet
}
\value{
CodeSet object
......@@ -16,8 +11,14 @@ CodeSet
\description{
Object representing a set of codes
}
\section{Fields}{
\examples{
data(RS.data)
rs = RS.data
code.set = code.set("Demo RS CodeSet", "CodeSet made for the demo", excerpts = rs$text, codes = c())
}
\section{Public fields}{
\if{html}{\out{<div class="r6-fields">}}
\describe{
\item{\code{title}}{Title of the CodeSet}
......@@ -26,12 +27,45 @@ Object representing a set of codes
\item{\code{excerpts}}{Character vector of text excerpts to code (optional)}
\item{\code{expressions}}{Codes to include in the CodeSet (optional)}
}}
}
\if{html}{\out{</div>}}
}
\section{Active bindings}{
\if{html}{\out{<div class="r6-active-bindings">}}
\describe{
\item{\code{expressions}}{Codes to include in the CodeSet (optional)}
}
\if{html}{\out{</div>}}
}
\section{Methods}{
\subsection{Public methods}{
\itemize{
\item \href{#method-new}{\code{CodeSet$new()}}
\item \href{#method-clone}{\code{CodeSet$clone()}}
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-new"></a>}}
\subsection{Method \code{new()}}{
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{CodeSet$new(title = "ne", description = NULL, excerpts = c(), codes = c())}\if{html}{\out{</div>}}
}
\examples{
data(RS.data)
rs = RS.data
code.set = code.set("Demo RS CodeSet", "CodeSet made for the demo", excerpts = rs$text, codes = c())
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-clone"></a>}}
\subsection{Method \code{clone()}}{
The objects of this class are cloneable with this method.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{CodeSet$clone(deep = FALSE)}\if{html}{\out{</div>}}
}
\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{deep}}{Whether to make a deep clone.}
}
\if{html}{\out{</div>}}