tctrg
tctrg

Reputation: 13

R Error in validObject while running quanteda.textstats

I have around 2000 text files. While running textstat_summary, I ran into the following error and am unsure what to do next. I was able to trace the problem to one specific file (though there may be more).

Error in validObject(.Object) : 
  invalid class “dfm” object: first element of 'p' slot is not 0

This is my code.

Because the problem comes from a specific file, I have attached it here for your reference: Link

Any suggestions for fixing the error are appreciated.

library(quanteda)
library(quanteda.textstats)
library(tidyverse)

# Build a quanteda corpus from every .txt file in the working directory and
# compute per-document summary statistics with textstat_summary().

# Paths of all files whose names end in ".txt" (full.names = TRUE so they can
# be passed straight to read_lines()).
mlist <- list.files(pattern = "\\.txt$", full.names = TRUE)

# Document ids are the bare file names, without the directory part.
file_names <- basename(mlist)

# Read each file, skipping its first 7 lines, and collapse the remaining lines
# into a single newline-separated string. vapply() produces the whole vector
# in one pass (and guarantees exactly one string per file) instead of growing
# the result with c() on every loop iteration.
contents <- vapply(
  mlist,
  function(path) paste(read_lines(path, skip = 7), collapse = "\n"),
  character(1),
  USE.NAMES = FALSE
)

# One row per file. co_cik is the first run of digits that is followed by an
# underscore; filing_date is the first "_....-..-.._" match (any characters in
# the "." positions). The delimiting underscores are stripped from both.
# Files whose names do not match get NA in the corresponding column.
cb_list <- data.frame(
  filename = file_names,
  content = contents,
  stringsAsFactors = FALSE
) |>
  mutate(
    co_cik = str_remove_all(str_extract(filename, "\\d+_"), "_"),
    filing_date = str_remove_all(str_extract(filename, "_....-..-.._"), "_")
  )

# The file name becomes the document id and `content` the document text; the
# remaining columns (co_cik, filing_date) are carried along as docvars.
crps <- corpus(cb_list, docid_field = "filename", text_field = "content")

text_stat_summary_cb_list <- textstat_summary(crps)

Upvotes: 1

Views: 88

Answers (1)

Mikael Jagan
Mikael Jagan

Reputation: 11336

There seems to be a bug in quanteda, at least in the latest version 4.0.2. I've just filed a report here. Basically, a method for generic function dfm is constructing an invalid dgCMatrix object, non-deterministically.

For a semblance of completeness, here is my debugging output:

debugging in: dfm.tokens_xptr(as.tokens_xptr(x), tolower = tolower, remove_padding = remove_padding, 
    verbose = verbose, ...)
debug: {
    if (is.null(global$object_class)) {
        global$object_class <- class(x)[1]
        global$proc_time <- proc.time()
    }
    check_dots(...)
    if (verbose) 
        catm("Creating a dfm from a", global$object_class, "object...\n")
    x <- as.tokens_xptr(x)
    if (tolower) 
        x <- tokens_tolower(x)
    if (remove_padding) 
        x <- tokens_remove(x, "", valuetype = "fixed")
    attrs <- attributes(x)
    temp <- t(cpp_dfm(x, attrs$meta$object$what == "dictionary"))
    result <- build_dfm(temp, colnames(temp), docvars = get_docvars(x, 
        user = TRUE, system = TRUE), meta = attrs[["meta"]])
    if (verbose) {
        catm(" ...complete, elapsed time:", format((proc.time() - 
            global$proc_time)[3], digits = 3), "seconds.\n")
        catm("Finished constructing a", paste(format(dim(result), 
            big.mark = ",", trim = TRUE), collapse = " x "), 
            "sparse dfm.\n")
    }
    global$object_class <- NULL
    return(result)
}
Browse[1]> n
debug: if (is.null(global$object_class)) {
    global$object_class <- class(x)[1]
    global$proc_time <- proc.time()
}
Browse[1]> n
debug: check_dots(...)
Browse[1]> n
debug: if (verbose) catm("Creating a dfm from a", global$object_class, 
    "object...\n")
Browse[1]> n
debug: x <- as.tokens_xptr(x)
Browse[1]> n
debug: if (tolower) x <- tokens_tolower(x)
Browse[1]> n
debug: x <- tokens_tolower(x)
Browse[1]> n
debug: if (remove_padding) x <- tokens_remove(x, "", valuetype = "fixed")
Browse[1]> n
debug: attrs <- attributes(x)
Browse[1]> n
debug: temp <- t(cpp_dfm(x, attrs$meta$object$what == "dictionary"))
Browse[1]> validObject(cpp_dfm(x, attrs$meta$object$what == "dictionary"))
Error in validObject(cpp_dfm(x, attrs$meta$object$what == "dictionary")) : 
  invalid class "dgCMatrix" object: first differences of 'p' slot exceed Dim[1]
Browse[1]> validObject(cpp_dfm(x, attrs$meta$object$what == "dictionary"))
[1] TRUE
Browse[1]> validObject(cpp_dfm(x, attrs$meta$object$what == "dictionary"))
[1] TRUE
Browse[1]>

Upvotes: 0

Related Questions