Reputation: 1
I want to modify this function so that it handles a vector of species rather than one species at a time.
merge_TATB_standalone <- function(ta, tb , species, country = "all",
strata = BioIndex::strata_scheme,
wd = NA, save = TRUE, verbose = TRUE) {
# For testing purposes only
if (FALSE) {
verbose <- TRUE
wd <- "wd"
species <- "ARISFOL"
country <- "all"
}
# Load or declare helper functions and objects as needed.
# For time conversion, ensure you have installed and loaded the hms package:
# install.packages("hms"); library(hms)
# The following line prevents errors if these functions are not available,
# but you need to have them in your environment or replace them with your own versions.
check_date_haul <- check_dictionary <- check_hauls_TBTA <- check_numeric_range <-
convert_coordinates <- hms <- write.table <- NULL
TA_cols <- BioIndex::TA_cols # expected columns for TA
TB_cols <- BioIndex::TB_cols # expected columns for TB
# Process the species code: here we assume the species string is at least 7 characters long.
# Adjust as necessary.
species[2] <- substr(species, 5, 7)
species[1] <- substr(species[1], 1, 4)
sspp <- paste(species[1], species[2], sep = "")
# Assign data frames
TA <- ta
TB <- tb
# Check that the countries in TA are also present in TB
TA_country <- sort(unique(as.character(TA$COUNTRY)))
TB_country <- sort(unique(as.character(TB$COUNTRY)))
for (i in 1:length(TA_country)) {
if (!(TA_country[i] %in% TB_country)) {
warning(paste("The country", TA_country[i], "is not present in the TB file."))
}
}
# Determine which countries to analyze.
if (length(TA_country) == 1) {
country_analysis <- TA_country
} else {
if (any(country %in% "all")) {
country_analysis <- TA_country
} else {
country_analysis <- country
}
}
# Subset TA and TB by country
TA <- TA[TA$COUNTRY %in% country_analysis, ]
TB <- TB[TB$COUNTRY %in% country_analysis, ]
# Create a unique haul identifier (id) for both TA and TB
id_TA <- data.frame(id = paste(TA$AREA, TA$COUNTRY, TA$YEAR, "_",
TA$VESSEL, TA$MONTH, TA$DAY, "_", TA$HAUL_NUMBER, sep = ""))
id_TB <- data.frame(id = paste(TB$AREA, TB$COUNTRY, TB$YEAR, "_",
TB$VESSEL, TB$MONTH, TB$DAY, "_", TB$HAUL_NUMBER, sep = ""))
# Remove invalid hauls (assumes TA has a column VALIDITY where "I" indicates invalid)
id_invalid <- id_TA$id[which(TA$VALIDITY == "I")]
# (Optional) Quality checks can be performed here on TA and TB by year.
yyy <- sort(unique(TA$YEAR))
suffix <- paste(as.character(Sys.Date()), format(Sys.time(), "_time_h%Hm%Ms%OS0"), sep = "")
for (yy in 1:length(yyy)) {
TA_year <- TA[TA$YEAR == yyy[yy], ]
TB_year <- TB[TB$YEAR == yyy[yy], ]
Field <- "SHOOTING_TIME"
Values <- seq(0, 2400, 1)
if (!check_dictionary(ResultData = TA_year, Field, Values,
year = yyy[yy], wd = file.path(wd, "output"),
suffix = suffix)) {
stop("SHOOTING_TIME value out of allowed range. Please check the logfile.")
}
Field <- "HAULING_TIME"
Values <- seq(0, 2400, 1)
if (!check_dictionary(ResultData = TA_year, Field, Values,
year = yyy[yy], wd = file.path(wd, "output"),
suffix = suffix)) {
stop("HAULING_TIME value out of allowed range. Please check the logfile.")
}
Field <- "WING_OPENING"
Values <- c(30, 50:250)
if (!check_dictionary(ResultData = TA_year, Field, Values,
year = yyy[yy], wd = file.path(wd, "output"),
suffix = suffix)) {
stop("WING_OPENING value out of allowed range. Please check the logfile.")
}
Field <- "SHOOTING_LATITUDE"
Values <- c(3020, 4730)
if (!check_numeric_range(Data = TA_year, Field, Values,
year = yyy[yy], wd = file.path(wd, "output"),
suffix = suffix)) {
stop("SHOOTING_LATITUDE in TA out of allowed range. Please check the logfile.")
}
Field <- "HAULING_LATITUDE"
Values <- c(3020, 4730)
if (!check_numeric_range(Data = TA_year, Field, Values,
year = yyy[yy], wd = file.path(wd, "output"),
suffix = suffix)) {
stop("HAULING_LATITUDE in TA out of allowed range. Please check the logfile.")
}
Field <- "SHOOTING_LONGITUDE"
Values <- c(0, 4200)
if (!check_numeric_range(Data = TA_year, Field, Values,
year = yyy[yy], wd = file.path(wd, "output"),
suffix = suffix)) {
stop("SHOOTING_LONGITUDE in TA out of allowed range. Please check the logfile.")
}
Field <- "HAULING_LONGITUDE"
Values <- c(0, 4200)
if (!check_numeric_range(Data = TA_year, Field, Values,
year = yyy[yy], wd = file.path(wd, "output"),
suffix = suffix)) {
stop("HAULING_LONGITUDE in TA out of allowed range. Please check the logfile.")
}
if (!check_date_haul(TA_year, TB_year, year = yyy[yy],
wd = file.path(wd, "output"), suffix = suffix)) {
stop("Date in TB not consistent with TA. Please check the logfile.")
}
if (!check_hauls_TBTA(TA_year, TB_year, year = yyy[yy],
wd = file.path(wd, "output"), suffix = suffix)) {
stop("Haul in TB not reported in TA. Please check the logfile.")
}
}
# Rename column "AREA" to "GSA" in both datasets
colnames(TA)[which(colnames(TA) == "AREA")] <- "GSA"
colnames(TB)[which(colnames(TB) == "AREA")] <- "GSA"
# Attach the id fields to the main data frames
TA_merge <- cbind(id_TA, TA)
TB_merge <- cbind(id_TB, TB)
TB_merge$GENUS <- as.character(TB_merge$GENUS)
TB_merge$SPECIES <- as.character(TB_merge$SPECIES)
# Remove invalid hauls
TA_merge <- TA_merge[!TA_merge$id %in% id_invalid, ]
TB_merge <- TB_merge[!TB_merge$id %in% id_invalid, ]
# Keep only the desired columns (based on BioIndex definitions)
TA_merge <- TA_merge[, which(colnames(TA_merge) %in% TA_cols)]
TB_merge <- TB_merge[TB_merge$GENUS == species[1] & TB_merge$SPECIES == species[2],
which(colnames(TB_merge) %in% TB_cols)]
if (verbose) {
message("- Merging TA-TB files")
}
# Merge TA and TB by the unique id
merge_TATB <- merge(TA_merge, TB_merge, by.x = "id", by.y = "id", all.x = TRUE)
# Create a MEDITS_CODE field from GENUS and SPECIES
merge_TATB$MEDITS_CODE <- as.character(paste(merge_TATB$GENUS, merge_TATB$SPECIES))
# Replace missing or invalid MEDITS_CODE with default values
l_TATB <- nrow(merge_TATB)
if(l_TATB > 0) {
for (i in 1:l_TATB) {
if (merge_TATB[i, "MEDITS_CODE"] == "NA NA") {
merge_TATB[i, "MEDITS_CODE"] <- "NA"
merge_TATB[i, "GENUS"] <- -1
merge_TATB[i, "SPECIES"] <- -1
merge_TATB[i, "TOTAL_WEIGHT_IN_THE_HAUL"] <- 0
merge_TATB[i, "TOTAL_NUMBER_IN_THE_HAUL"] <- 0
merge_TATB[i, "NB_OF_FEMALES"] <- 0
merge_TATB[i, "NB_OF_MALES"] <- 0
merge_TATB[i, "NB_OF_UNDETERMINED"] <- 0
}
}
}
# (Optional) Coordinate conversion if needed (using convert_coordinates)
coord <- convert_coordinates(merge_TATB)
merge_TATB$MEAN_LATITUDE <- (merge_TATB$SHOOTING_LATITUDE + merge_TATB$HAULING_LATITUDE) / 2
merge_TATB$MEAN_LATITUDE_DEC <- (coord$lat_start + coord$lat_end) / 2
merge_TATB$MEAN_LONGITUDE <- (merge_TATB$SHOOTING_LONGITUDE + merge_TATB$HAULING_LONGITUDE) / 2
merge_TATB$MEAN_LONGITUDE_DEC <- (coord$lon_start + coord$lon_end) / 2
merge_TATB$MEAN_DEPTH <- (merge_TATB$SHOOTING_DEPTH + merge_TATB$HAULING_DEPTH) / 2
merge_TATB$SWEPT_AREA <- merge_TATB$DISTANCE * merge_TATB$WING_OPENING / 1e+07
# Process time fields to calculate haul duration, densities, etc.
l_TATB <- nrow(merge_TATB)
# Initialize vectors for shooting and hauling times
hour_shooting <- rep(NA, l_TATB)
min_shooting <- rep(NA, l_TATB)
hour_hauling <- rep(NA, l_TATB)
min_hauling <- rep(NA, l_TATB)
# Optionally, subset the strata scheme if needed (using %in% to avoid length warnings)
strata_scheme <- strata_scheme[strata_scheme$GSA %in% unique(merge_TATB$GSA) &
strata_scheme$COUNTRY %in% as.character(unique(merge_TATB$COUNTRY)), ]
# Loop over each row to assign a STRATUM_CODE and extract time components
for (i in 1:l_TATB) {
for (j in 1:nrow(strata_scheme)) {
if (strata_scheme[j, 3] == 1) {
if (floor(merge_TATB$MEAN_DEPTH[i]) >= strata_scheme[j, 4] &&
floor(merge_TATB$MEAN_DEPTH[i]) <= strata_scheme[j, 5]) {
merge_TATB$STRATUM_CODE[i] <- strata_scheme[j, 3]
}
} else {
if (floor(merge_TATB$MEAN_DEPTH[i]) > strata_scheme[j, 4] &&
floor(merge_TATB$MEAN_DEPTH[i]) <= strata_scheme[j, 5]) {
merge_TATB$STRATUM_CODE[i] <- strata_scheme[j, 3]
}
}
}
# Extract hours and minutes for SHOOTING_TIME
if (nchar(merge_TATB$SHOOTING_TIME[i]) == 4) {
hour_shooting[i] <- substr(merge_TATB$SHOOTING_TIME[i], 1, 2)
min_shooting[i] <- substr(merge_TATB$SHOOTING_TIME[i], 3, 4)
} else if (nchar(merge_TATB$SHOOTING_TIME[i]) == 3) {
hour_shooting[i] <- substr(merge_TATB$SHOOTING_TIME[i], 1, 1)
min_shooting[i] <- substr(merge_TATB$SHOOTING_TIME[i], 2, 3)
}
# Extract hours and minutes for HAULING_TIME
if (nchar(merge_TATB$HAULING_TIME[i]) == 4) {
hour_hauling[i] <- substr(merge_TATB$HAULING_TIME[i], 1, 2)
min_hauling[i] <- substr(merge_TATB$HAULING_TIME[i], 3, 4)
} else if (nchar(merge_TATB$HAULING_TIME[i]) == 3) {
hour_hauling[i] <- substr(merge_TATB$HAULING_TIME[i], 1, 1)
min_hauling[i] <- substr(merge_TATB$HAULING_TIME[i], 2, 3)
}
}
# Convert the extracted time components into a time object using hms
hms_shooting <- hms(rep(0, l_TATB), as.numeric(min_shooting), as.numeric(hour_shooting))
hms_hauling <- hms(rep(0, l_TATB), as.numeric(min_hauling), as.numeric(hour_hauling))
# Calculate duration in hours (assuming hms subtraction yields seconds)
duration <- as.numeric(hms_hauling - hms_shooting) / 3600
merge_TATB$SHOOTING_TIME <- hms_shooting
merge_TATB$HAULING_TIME <- hms_hauling
merge_TATB$N_h <- merge_TATB$TOTAL_NUMBER_IN_THE_HAUL / duration
merge_TATB$N_km2 <- merge_TATB$TOTAL_NUMBER_IN_THE_HAUL / merge_TATB$SWEPT_AREA
merge_TATB$kg_h <- merge_TATB$TOTAL_WEIGHT_IN_THE_HAUL / duration / 1000
merge_TATB$kg_km2<- merge_TATB$TOTAL_WEIGHT_IN_THE_HAUL / merge_TATB$SWEPT_AREA / 1000
merge_TATB$MEDITS_CODE <- as.character(merge_TATB$MEDITS_CODE)
if (save) {
outfile <- paste0(wd, "/output/mergeTATB_", species[1], species[2], ".csv")
write.table(merge_TATB, outfile, sep = ";", row.names = FALSE)
if (verbose) {
message("TA-TB files correctly merged")
message(paste("Merge TA-TB file saved in:", outfile))
}
} else {
if (verbose) {
message("TA-TB files correctly merged (file not saved)")
}
}
return(merge_TATB)
}
I tried looping the function for a species vector but nothing i did worked.
Upvotes: -1
Views: 51
Reputation: 6770
Without seeing example data (and understanding its structure) it's impossible to provide very specific code. So I am going to give you some general advice that should get you closer.
Your code is complicated and does many things. I would recommend that you start by creating some specific functions for each subsection of your function. Then you can call those functions from within your function. That will make it easier to determine where things are going wrong. It will also make it easier to reach your goal of being able to generate outputs for each species separately.
For example, you also have a lot of things with pattern
Field <- "WING_OPENING"
Values <- c(30, 50:250)
if (!check_dictionary(ResultData = TA_year, Field, Values,
year = yyy[yy], wd = file.path(wd, "output"),
suffix = suffix)) {
stop("WING_OPENING value out of allowed range. Please check the logfile.")
}
If you change the last line to
stop(paste0( Field, " value out of allowed range. Please check the logfile."))
There would be no reason to keep repeating that code. You could simply *apply() that function across a data frame that has the Field and values data.
Also ask yourself if there are some things that can be done one time for all of the data before you start processing individual species. For example, all of the date time recoding ... if that is the same for all the species do it one time.
Lastly, to check if a package (hms) is installed see this answer Check for installed packages before running install.packages().
Adding
It looks like species is only used here:
species[2] <- substr(species, 5, 7)
species[1] <- substr(species[1], 1, 4)
sspp <- paste(species[1], species[2], sep = "")
And this logic is not going to work if your vector of species is longer than length 1 since you overwrite species[2].
You create sspp
but don't ever do anything with it.
What is it that you want to do with it?
species2 <- substr(species, 5, 7)
species3 <- substr(species, 1, 4)
sspp <- paste(species2, species3, sep = " ")
Would give you a vectorized result.
species <- c("aaaaaaaaaaa", "bbbbbbbbbb", "cccccccccc" )
species2 <- substr(species, 5, 7)
species3 <- substr(species, 1, 4)
sspp <- paste(species2, species3, sep = " ")
sspp
[1] "aaa aaaa" "bbb bbbb" "ccc cccc"
Upvotes: 0