From a8e8c6b197c6f3eef5534abbbe22cb424f7a9fab Mon Sep 17 00:00:00 2001 From: muschellij2 Date: Mon, 14 May 2018 15:16:13 -0400 Subject: [PATCH 01/20] - change .update_tab_path to be relative if path = "." for ukb_df --- R/ukb_dataset.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/ukb_dataset.R b/R/ukb_dataset.R index 09b945f..0f3c3b8 100644 --- a/R/ukb_dataset.R +++ b/R/ukb_dataset.R @@ -206,8 +206,8 @@ ukb_df_field <- function(fileset, path = ".", data.pos = 2, as.lookup = FALSE) { # Update path to tab file in R source if(path == ".") { - tab_location <- file.path(getwd(), tab_file) - r_location <- file.path(getwd(), r_file) + tab_location <- tab_file + r_location <- r_file } else { tab_location <- file.path(path, tab_file) r_location <- file.path(path, r_file) From df2747799aac0cff6af094eef560bdab7006369f Mon Sep 17 00:00:00 2001 From: muschellij2 Date: Wed, 23 May 2018 09:23:23 -0400 Subject: [PATCH 02/20] added desc --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8acdbd5..d2683cf 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -24,7 +24,7 @@ Imports: scales, stringr, data.table -RoxygenNote: 6.0.1 +RoxygenNote: 6.0.1.9000 Suggests: knitr, rmarkdown VignetteBuilder: knitr From 710785df195250100aa2b0b37b822df943314cb9 Mon Sep 17 00:00:00 2001 From: muschellij2 Date: Fri, 25 May 2018 14:48:40 -0400 Subject: [PATCH 03/20] merged with PR --- R/ukb_dataset.R | 10 ---------- man/ukb_gen_samples_to_remove.Rd | 4 ++-- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/R/ukb_dataset.R b/R/ukb_dataset.R index 7881d6e..1fd99f8 100644 --- a/R/ukb_dataset.R +++ b/R/ukb_dataset.R @@ -202,18 +202,8 @@ ukb_df_field <- function(fileset, path = ".", data.pos = 2, as.lookup = FALSE) { tab_file <- stringr::str_interp("${fileset}.tab") # Update path to tab file in R source -<<<<<<< HEAD - if(path == ".") { - tab_location <- tab_file - r_location <- r_file - } else { - tab_location <- file.path(path, tab_file) - r_location <- file.path(path, r_file) - } -======= tab_location <- file.path(path, tab_file) r_location <- file.path(path, r_file) ->>>>>>> upstream/master edit_date <- Sys.time() diff --git a/man/ukb_gen_samples_to_remove.Rd b/man/ukb_gen_samples_to_remove.Rd index 63f37fc..f65d92d 100644 --- a/man/ukb_gen_samples_to_remove.Rd +++ b/man/ukb_gen_samples_to_remove.Rd @@ -30,8 +30,8 @@ Date: Wed, 26 Jul 2017 17:06:01 +0100 (...) you could use the list of samples which we used to calculate the PCs, which is a (maximal) subset of unrelated participants after applying some QC -filtering. Please read supplementary Section S 3.3.2 for details. You can -find the list of samples using the “used.in.pca.calculation" column in the +filtering. Please read supplementary Section S3.3.2 for details. You can +find the list of samples using the "used.in.pca.calculation" column in the sample-QC file (ukb_sqc_v2.txt) (...). Note that this set contains diverse ancestries. If you take the intersection with the white British ancestry subset you get ~337,500 unrelated samples. From cf0eea9df2aeca2af2bf63298b62aae61a40f798 Mon Sep 17 00:00:00 2001 From: muschellij2 Date: Tue, 26 Jun 2018 19:45:43 -0400 Subject: [PATCH 04/20] fixing typo in the Rd file --- R/ukb_dataset.R | 2 +- man/ukb_df_full_join.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/ukb_dataset.R b/R/ukb_dataset.R index 1fd99f8..8d41234 100644 --- a/R/ukb_dataset.R +++ b/R/ukb_dataset.R @@ -244,7 +244,7 @@ ukb_df_field <- function(fileset, path = ".", data.pos = 2, as.lookup = FALSE) { #' @param ... Supply comma separated unquoted names of to-be-merged UKB datasets (created with \code{\link{ukb_df}}). Arguments are passed to \code{list}. #' @param by Variable used to merge multiple dataframes (default = "eid"). #' -#' @details The function takes a comma separated list of unquoted datasets. By explicitly setting the join key to "eid" only (Default value of the \code{by} parameter), any additional variables common to any two tables will have ".x" and ".y" appended to their names. If you are satisfied the additional variables are identical to the original, the copies can be safely deleted. For example, if \code{setequal(my_ukb_data$var, my_ukb_data$var.x)} is \code{TRUE}, then my_ukb_data$var.x can be dropped. A \code{dlyr::full_join} is like the set operation union in that all observations from all tables are included, i.e., all samples are included even if they are not included in all datasets. +#' @details The function takes a comma separated list of unquoted datasets. By explicitly setting the join key to "eid" only (Default value of the \code{by} parameter), any additional variables common to any two tables will have ".x" and ".y" appended to their names. If you are satisfied the additional variables are identical to the original, the copies can be safely deleted. For example, if \code{setequal(my_ukb_data$var, my_ukb_data$var.x)} is \code{TRUE}, then my_ukb_data$var.x can be dropped. A \code{dplyr::full_join} is like the set operation union in that all observations from all tables are included, i.e., all samples are included even if they are not included in all datasets. #' #' NB. \code{ukb_df_full_join} will fail if any variable names are repeated **within** a single UKB dataset. This is unlikely to occur, however, \code{ukb_df} creates variable names by combining a snake_case descriptor with the variable's **index** and **array**. If an index_array combination is incorrectly repeated, this will result in a duplicated variable. If the join fails, you can use \code{\link{ukb_df_duplicated_name}} to find duplicated names. See \code{vignette(topic = "explore-ukb-data", package = "ukbtools")} for further details. #' diff --git a/man/ukb_df_full_join.Rd b/man/ukb_df_full_join.Rd index d8872bd..c556e13 100644 --- a/man/ukb_df_full_join.Rd +++ b/man/ukb_df_full_join.Rd @@ -15,7 +15,7 @@ ukb_df_full_join(..., by = "eid") A thin wrapper around \code{purrr::reduce} and \code{dplyr::full_join} to merge multiple UKB datasets. } \details{ -The function takes a comma separated list of unquoted datasets. By explicitly setting the join key to "eid" only (Default value of the \code{by} parameter), any additional variables common to any two tables will have ".x" and ".y" appended to their names. If you are satisfied the additional variables are identical to the original, the copies can be safely deleted. For example, if \code{setequal(my_ukb_data$var, my_ukb_data$var.x)} is \code{TRUE}, then my_ukb_data$var.x can be dropped. A \code{dlyr::full_join} is like the set operation union in that all observations from all tables are included, i.e., all samples are included even if they are not included in all datasets. +The function takes a comma separated list of unquoted datasets. By explicitly setting the join key to "eid" only (Default value of the \code{by} parameter), any additional variables common to any two tables will have ".x" and ".y" appended to their names. If you are satisfied the additional variables are identical to the original, the copies can be safely deleted. For example, if \code{setequal(my_ukb_data$var, my_ukb_data$var.x)} is \code{TRUE}, then my_ukb_data$var.x can be dropped. A \code{dplyr::full_join} is like the set operation union in that all observations from all tables are included, i.e., all samples are included even if they are not included in all datasets. NB. \code{ukb_df_full_join} will fail if any variable names are repeated **within** a single UKB dataset. This is unlikely to occur, however, \code{ukb_df} creates variable names by combining a snake_case descriptor with the variable's **index** and **array**. If an index_array combination is incorrectly repeated, this will result in a duplicated variable. If the join fails, you can use \code{\link{ukb_df_duplicated_name}} to find duplicated names. See \code{vignette(topic = "explore-ukb-data", package = "ukbtools")} for further details. } From 9a26988cd0bea32ed35281aae203b66670134e4e Mon Sep 17 00:00:00 2001 From: muschellij2 Date: Mon, 22 Jun 2020 16:27:35 -0400 Subject: [PATCH 05/20] udated the rd --- R/uk_util.R | 3 ++- man/ukb_util_path.Rd | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/R/uk_util.R b/R/uk_util.R index b6b265e..9951c11 100644 --- a/R/uk_util.R +++ b/R/uk_util.R @@ -26,7 +26,8 @@ sys_type <- function() { #' @export #' #' @examples -#' ukb_util_path("ukbmd5") +#' md5 = ukb_util_path("ukbmd5") +#' file.remove(md5) ukb_util_path = function( util = c("ukbmd5", "ukbconv", "ukbunpack", "ukbfetch", "ukblink", "ukbgene", diff --git a/man/ukb_util_path.Rd b/man/ukb_util_path.Rd index 99b8659..722b61d 100644 --- a/man/ukb_util_path.Rd +++ b/man/ukb_util_path.Rd @@ -28,5 +28,6 @@ A path to the utility Get Path to UKB Utilitiy } \examples{ -ukb_util_path("ukbmd5") +md5 = ukb_util_path("ukbmd5") +file.remove(md5) } From 650576ce0a6219ad8e8cb45029a150d0fcca852b Mon Sep 17 00:00:00 2001 From: muschellij2 Date: Mon, 22 Jun 2020 16:39:18 -0400 Subject: [PATCH 06/20] updated the utilities --- NAMESPACE | 3 +++ R/uk_util.R | 50 ++++++++++++++++++++++++++++++++++++++++++++++++-- man/ukb_md5.Rd | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 2 deletions(-) create mode 100644 man/ukb_md5.Rd diff --git a/NAMESPACE b/NAMESPACE index b96a078..9d49442 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ export(ukb_centre) export(ukb_context) +export(ukb_conv) export(ukb_df) export(ukb_df_duplicated_name) export(ukb_df_field) @@ -27,6 +28,8 @@ export(ukb_icd_diagnosis) export(ukb_icd_freq_by) export(ukb_icd_keyword) export(ukb_icd_prevalence) +export(ukb_md5) +export(ukb_unpack) export(ukb_util_path) import(dplyr) import(ggplot2) diff --git a/R/uk_util.R b/R/uk_util.R index 9951c11..54ae24e 100644 --- a/R/uk_util.R +++ b/R/uk_util.R @@ -69,6 +69,7 @@ ukb_util_path = function( download.file(url, destfile = destfile, mode = "wb") } + Sys.chmod(destfile) return(destfile) } else { stop( @@ -88,7 +89,52 @@ ukb_encoding = function( outdir = tempdir()) { outdir = outdir) } -ukb_md5 = function(filename, checksum, ...) { +#' UKB MD5 Checksum +#' +#' @param file name of file to run utility on +#' @param ... additional arguments to pass to +#' \code{\link{ukb_util_path}} +#' +#' @return A character string +#' +#' @export +ukb_md5 = function(file, ...) { path = ukb_util_path("ukbmd5", ...) - out = system2(path, filename) + out = system2(path, file, stdout = TRUE) + out = out[grepl("MD5=", out)] + out = sub(".*MD5=", "", out) + return(out) +} + + +#' @rdname ukb_md5 +#' @param key file to key to unpack/decrypt file +#' @export +ukb_unpack = function(file, key, ...) { + path = ukb_util_path("ukbunpack", ...) + out = system2(path, c(file, key)) + if (out != 0) { + warning("Unpacking did not seem to complete successfully") + } + out = paste0(file, "_ukb") + return(out) +} + + +#' @rdname ukb_md5 +#' @param type type of conversion to do +#' @export +ukb_conv = function(file, + type = c("r", "docs", + "csv", "sas", + "stata", + "lims", "bulk", + "txt"), ...) { + type = match.arg(type) + path = ukb_util_path("ukbconv", ...) + out = system2(path, c(file, type)) + if (out != 0) { + warning("Convert did not seem to complete successfully") + } + return(out) } diff --git a/man/ukb_md5.Rd b/man/ukb_md5.Rd new file mode 100644 index 0000000..ab7e4dd --- /dev/null +++ b/man/ukb_md5.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/uk_util.R +\name{ukb_md5} +\alias{ukb_md5} +\alias{ukb_unpack} +\alias{ukb_conv} +\title{UKB MD5 Checksum} +\usage{ +ukb_md5(file, ...) + +ukb_unpack(file, key, ...) + +ukb_conv( + file, + type = c("r", "docs", "csv", "sas", "stata", "lims", "bulk", "txt"), + ... +) +} +\arguments{ +\item{file}{name of file to run utility on} + +\item{...}{additional arguments to pass to +\code{\link{ukb_util_path}}} + +\item{key}{file to key to unpack/decrypt file} + +\item{type}{type of conversion to do} +} +\value{ +A character string +} +\description{ +UKB MD5 Checksum +} From d2e1b542061774c8cf9674dc0a514aa92dbcf08c Mon Sep 17 00:00:00 2001 From: muschellij2 Date: Mon, 22 Jun 2020 16:41:53 -0400 Subject: [PATCH 07/20] updated the utilities --- DESCRIPTION | 3 ++- R/uk_util.R | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 197d324..5aac774 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -29,7 +29,8 @@ Imports: doParallel, lifecycle, xml2, - rvest + rvest, + utils RoxygenNote: 7.1.0 Suggests: knitr, diff --git a/R/uk_util.R b/R/uk_util.R index 54ae24e..5bd2a1e 100644 --- a/R/uk_util.R +++ b/R/uk_util.R @@ -66,7 +66,7 @@ ukb_util_path = function( util, ext) destfile = file.path(outdir, basename(url)) if (!file.exists(destfile)) { - download.file(url, destfile = destfile, + utils::download.file(url, destfile = destfile, mode = "wb") } Sys.chmod(destfile) From 10d08aa0c4762a91bf6e1c327d748760cb74de15 Mon Sep 17 00:00:00 2001 From: muschellij2 Date: Mon, 22 Jun 2020 17:01:17 -0400 Subject: [PATCH 08/20] updated news --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 7c10424..d520a20 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,8 @@ Bug fix: dplyr update broke `ukb_icd_diagnosis`. Fixed in dev version. +* Added `ukb_util_*` functions to be able to perform system-level calls for fetching and unpacking. + # ukbtools 0.11.3 From 7897fc6f3cead50fdff17fbcc3a367764192dcfc Mon Sep 17 00:00:00 2001 From: muschellij2 Date: Mon, 22 Jun 2020 19:17:19 -0400 Subject: [PATCH 09/20] aded fetch --- NAMESPACE | 1 + R/uk_util.R | 43 +++++++++++++++++++++++++++++++++++++++++-- man/ukb_md5.Rd | 5 +++++ 3 files changed, 47 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 9d49442..a7e34b4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,6 +8,7 @@ export(ukb_df_duplicated_name) export(ukb_df_field) export(ukb_df_full_join) export(ukb_encoding) +export(ukb_fetch_bulk) export(ukb_gen_excl) export(ukb_gen_excl_to_na) export(ukb_gen_het) diff --git a/R/uk_util.R b/R/uk_util.R index 5bd2a1e..29e58b8 100644 --- a/R/uk_util.R +++ b/R/uk_util.R @@ -56,7 +56,8 @@ ukb_util_path = function( if (!nzchar(noah)) { warning( paste0( - "You may need noah to use these linux execs on Mac OSX,", + "You may need noah to use these ", + "linux execs on Mac OSX,", " See https://github.com/linux-noah/noah ") ) } @@ -67,7 +68,7 @@ ukb_util_path = function( destfile = file.path(outdir, basename(url)) if (!file.exists(destfile)) { utils::download.file(url, destfile = destfile, - mode = "wb") + mode = "wb") } Sys.chmod(destfile) return(destfile) @@ -138,3 +139,41 @@ ukb_conv = function(file, } return(out) } + +#' @rdname ukb_md5 +#' @param start start of the fetching, 1-indexed +#' @export +ukb_fetch_bulk = function( + file, + key, + start = NULL, + ...) { + stopifnot(file.exists(file)) + + n_max = 1000 + if (is.null(start)) { + x = readLines(file) + n = length(x) + if (n > n_max) { + start = (seq(0, ceiling(n / n_max) -1) * n_max) + 1 + } else { + start = 1 + } + } + + path = ukb_util_path("ukbfetch", ...) + bfile = paste0("-b", file) + akey = paste0("-a", key) + + starts = paste0("-s", start) + x = starts[1] + num = paste0("-m", n_max) + res = sapply(starts, function(x) { + out = system2(path, c(bfile, akey, x, num)) + if (out != 0) { + warning("Convert did not seem to complete successfully") + } + out + }) + return(res) +} diff --git a/man/ukb_md5.Rd b/man/ukb_md5.Rd index ab7e4dd..b239346 100644 --- a/man/ukb_md5.Rd +++ b/man/ukb_md5.Rd @@ -4,6 +4,7 @@ \alias{ukb_md5} \alias{ukb_unpack} \alias{ukb_conv} +\alias{ukb_fetch_bulk} \title{UKB MD5 Checksum} \usage{ ukb_md5(file, ...) @@ -15,6 +16,8 @@ ukb_conv( type = c("r", "docs", "csv", "sas", "stata", "lims", "bulk", "txt"), ... ) + +ukb_fetch_bulk(file, key, start = NULL, ...) } \arguments{ \item{file}{name of file to run utility on} @@ -25,6 +28,8 @@ ukb_conv( \item{key}{file to key to unpack/decrypt file} \item{type}{type of conversion to do} + +\item{start}{start of the fetching, 1-indexed} } \value{ A character string From 838fbd0f8077cb51b502c078cd492eb62974ab4c Mon Sep 17 00:00:00 2001 From: muschellij2 Date: Mon, 22 Jun 2020 19:29:20 -0400 Subject: [PATCH 10/20] updated key --- R/uk_util.R | 12 ++++++++++++ man/ukb_md5.Rd | 4 +++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/R/uk_util.R b/R/uk_util.R index 29e58b8..ed3ea0c 100644 --- a/R/uk_util.R +++ b/R/uk_util.R @@ -142,14 +142,26 @@ ukb_conv = function(file, #' @rdname ukb_md5 #' @param start start of the fetching, 1-indexed +#' @param outdir output directory of download #' @export ukb_fetch_bulk = function( file, key, start = NULL, + outdir = NULL, ...) { stopifnot(file.exists(file)) + owd = getwd() + if (!is.null(outdir)) { + setwd(outdir) + on.exit({ + setwd(owd) + }, add = TRUE) + } + file = normalizePath(file, mustWork = TRUE, winslash = "/") + key = normalizePath(key, mustWork = TRUE, winslash = "/") + n_max = 1000 if (is.null(start)) { x = readLines(file) diff --git a/man/ukb_md5.Rd b/man/ukb_md5.Rd index b239346..b9bbc91 100644 --- a/man/ukb_md5.Rd +++ b/man/ukb_md5.Rd @@ -17,7 +17,7 @@ ukb_conv( ... ) -ukb_fetch_bulk(file, key, start = NULL, ...) +ukb_fetch_bulk(file, key, start = NULL, outdir = NULL, ...) } \arguments{ \item{file}{name of file to run utility on} @@ -30,6 +30,8 @@ ukb_fetch_bulk(file, key, start = NULL, ...) \item{type}{type of conversion to do} \item{start}{start of the fetching, 1-indexed} + +\item{outdir}{output directory of download} } \value{ A character string From 4defefb066d84f488806728ca1d4baac4e6a0977 Mon Sep 17 00:00:00 2001 From: muschellij2 Date: Mon, 22 Jun 2020 19:40:11 -0400 Subject: [PATCH 11/20] updated the normalizePath --- R/uk_util.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/uk_util.R b/R/uk_util.R index ed3ea0c..7b2ff11 100644 --- a/R/uk_util.R +++ b/R/uk_util.R @@ -152,6 +152,9 @@ ukb_fetch_bulk = function( ...) { stopifnot(file.exists(file)) + file = normalizePath(file, mustWork = TRUE, winslash = "/") + key = normalizePath(key, mustWork = TRUE, winslash = "/") + owd = getwd() if (!is.null(outdir)) { setwd(outdir) @@ -159,8 +162,6 @@ ukb_fetch_bulk = function( setwd(owd) }, add = TRUE) } - file = normalizePath(file, mustWork = TRUE, winslash = "/") - key = normalizePath(key, mustWork = TRUE, winslash = "/") n_max = 1000 if (is.null(start)) { From 69362bfd80912ce1ca88235759c0ff7012d3539e Mon Sep 17 00:00:00 2001 From: muschellij2 Date: Mon, 29 Jun 2020 16:22:06 -0400 Subject: [PATCH 12/20] added temporary argument to get around permissions failures --- R/dataset.R | 24 ++++++++++++++++++++---- R/uk_util.R | 6 ++++++ man/ukb_df.Rd | 5 ++++- 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/R/dataset.R b/R/dataset.R index ec1dfaa..510b179 100644 --- a/R/dataset.R +++ b/R/dataset.R @@ -14,6 +14,8 @@ globalVariables( #' @param path The path to the directory containing your UKB fileset. The default value is the current directory. #' @param n_threads Either "max" (uses the number of cores, `parallel::detectCores()`), "dt" (default - uses the data.table default, `data.table::getDTthreads()`), or a numerical value (in which case n_threads is set to the supplied value, or `parallel::detectCores()` if it is smaller). #' @param data.pos Locates the data in your .html file. The .html file is read into a list; the default value data.pos = 2 indicates the second item in the list. (The first item in the list is the title of the table). You will probably not need to change this value, but if the need arises you can open the .html file in a browser and identify where in the file the data is. +#' @param temporary Should the `R` file be copied to a temporary directory? +#' Useful for permissions issues, especially on computing clusters. #' #' @details The \strong{index} and \strong{array} from the UKB field code are preserved in the variable name, as two numbers separated by underscores at the end of the name e.g. \emph{variable_index_array}. \strong{index} refers the assessment instance (or visit). \strong{array} captures multiple answers to the same "question". See UKB documentation for detailed descriptions of \href{http://biobank.ctsu.ox.ac.uk/crystal/instance.cgi?id=2}{index} and \href{http://biobank.ctsu.ox.ac.uk/crystal/help.cgi?cd=array}{array}. #' @@ -45,7 +47,10 @@ globalVariables( #' ukb_df_full_join(ukb1234_data, ukb2345_data, ukb3456_data) #' } #' -ukb_df <- function(fileset, path = ".", n_threads = "dt", data.pos = 2) { +ukb_df <- function(fileset, path = ".", n_threads = "dt", data.pos = 2, + temporary = FALSE) { + + fileset = stringr::str_replace(fileset, "[.](r|html|tab)$", "") # Check files exist html_file <- stringr::str_interp("${fileset}.html") @@ -90,7 +95,14 @@ ukb_df <- function(fileset, path = ".", n_threads = "dt", data.pos = 2) { # Comment out .r read of .tab # Read .tab file from user named path with data.table::fread # Include UKB-generated categorical variable labels - bd <- read_ukb_tab(fileset, column_type = ukb_key$fread_column_type, path, n_threads = n_threads) + bd <- read_ukb_tab(fileset, + column_type = ukb_key$fread_column_type, + path, + n_threads = n_threads, + temporary = temporary) + if (temporary) { + r_file = file.path(tempdir(), basename(r_file)) + } source(file.path(path, r_file), local = TRUE) names(bd) <- ukb_key$col.name[match(names(bd), ukb_key$field.tab)] @@ -205,7 +217,9 @@ description_to_name <- function(data) { # @param fileset prefix for UKB fileset # @param path The path to the directory containing your UKB fileset. The default value is the current directory. # -read_ukb_tab <- function(fileset, column_type, path = ".", n_threads = "max") { +read_ukb_tab <- function(fileset, column_type, path = ".", + n_threads = "max", + temporary = FALSE) { r_file <- stringr::str_interp("${fileset}.r") tab_file <- stringr::str_interp("${fileset}.tab") @@ -221,7 +235,9 @@ read_ukb_tab <- function(fileset, column_type, path = ".", n_threads = "max") { replacement = stringr::str_interp( "# Read function edited by ukbtools ${edit_date}\n# bd <-") ) - + if (temporary) { + r_location = file.path(tempdir(), basename(r_location)) + } cat(f, file = r_location, sep = "\n") bd <- data.table::fread( diff --git a/R/uk_util.R b/R/uk_util.R index 7b2ff11..655e6f9 100644 --- a/R/uk_util.R +++ b/R/uk_util.R @@ -154,6 +154,12 @@ ukb_fetch_bulk = function( file = normalizePath(file, mustWork = TRUE, winslash = "/") key = normalizePath(key, mustWork = TRUE, winslash = "/") + if (nchar(file) > 64) { + warning("File may be too long > 64 characters") + } + if (nchar(key) > 64) { + warning("key file may be too long > 64 characters") + } owd = getwd() if (!is.null(outdir)) { diff --git a/man/ukb_df.Rd b/man/ukb_df.Rd index 2f1bfed..de1f468 100644 --- a/man/ukb_df.Rd +++ b/man/ukb_df.Rd @@ -4,7 +4,7 @@ \alias{ukb_df} \title{Reads a UK Biobank phenotype fileset and returns a single dataset.} \usage{ -ukb_df(fileset, path = ".", n_threads = "dt", data.pos = 2) +ukb_df(fileset, path = ".", n_threads = "dt", data.pos = 2, temporary = FALSE) } \arguments{ \item{fileset}{The prefix for a UKB fileset, e.g., ukbxxxx (for ukbxxxx.tab, ukbxxxx.r, ukbxxxx.html)} @@ -14,6 +14,9 @@ ukb_df(fileset, path = ".", n_threads = "dt", data.pos = 2) \item{n_threads}{Either "max" (uses the number of cores, `parallel::detectCores()`), "dt" (default - uses the data.table default, `data.table::getDTthreads()`), or a numerical value (in which case n_threads is set to the supplied value, or `parallel::detectCores()` if it is smaller).} \item{data.pos}{Locates the data in your .html file. The .html file is read into a list; the default value data.pos = 2 indicates the second item in the list. (The first item in the list is the title of the table). You will probably not need to change this value, but if the need arises you can open the .html file in a browser and identify where in the file the data is.} + +\item{temporary}{Should the `R` file be copied to a temporary directory? +Useful for permissions issues, especially on computing clusters.} } \value{ A dataframe with variable names in snake_case (lowercase and separated by an underscore). From 12bb50fdf00f8225f566cc9d67b98f9f61aee674 Mon Sep 17 00:00:00 2001 From: muschellij2 Date: Mon, 29 Jun 2020 16:45:18 -0400 Subject: [PATCH 13/20] needed to pass r_file through --- R/dataset.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/dataset.R b/R/dataset.R index 510b179..85a373b 100644 --- a/R/dataset.R +++ b/R/dataset.R @@ -102,8 +102,10 @@ ukb_df <- function(fileset, path = ".", n_threads = "dt", data.pos = 2, temporary = temporary) if (temporary) { r_file = file.path(tempdir(), basename(r_file)) + } else { + r_file = file.path(path, r_file) } - source(file.path(path, r_file), local = TRUE) + source(r_file, local = TRUE) names(bd) <- ukb_key$col.name[match(names(bd), ukb_key$field.tab)] return(bd) From ca9b5cb6356a82b7c7377ba05597bfc18f9bcb99 Mon Sep 17 00:00:00 2001 From: muschellij2 Date: Wed, 22 Jul 2020 19:19:20 -0400 Subject: [PATCH 14/20] adde encoding file --- DESCRIPTION | 2 +- R/uk_util.R | 21 ++++++++++++++++++++- man/ukb_md5.Rd | 4 ++++ 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5aac774..7abd645 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -31,7 +31,7 @@ Imports: xml2, rvest, utils -RoxygenNote: 7.1.0 +RoxygenNote: 7.1.1 Suggests: knitr, rmarkdown, diff --git a/R/uk_util.R b/R/uk_util.R index 655e6f9..d0f819c 100644 --- a/R/uk_util.R +++ b/R/uk_util.R @@ -124,19 +124,38 @@ ukb_unpack = function(file, key, ...) { #' @rdname ukb_md5 #' @param type type of conversion to do +#' @param encoding_file encoding file to map for `ukbconv`. If want no +#' encoding, set to \code{NULL} #' @export ukb_conv = function(file, + encoding_file = "encoding.ukb", type = c("r", "docs", "csv", "sas", "stata", "lims", "bulk", "txt"), ...) { type = match.arg(type) + if (!is.null(encoding_file)) { + url = paste0("http://biobank.ndph.ox.ac.uk/showcase/util/", + "encoding.ukb") + if (!file.exists(encoding_file)) { + utils::download.file(url, destfile = encoding_file, + mode = "wb") + } + } path = ukb_util_path("ukbconv", ...) - out = system2(path, c(file, type)) + args = c(file, type) + # if not default file + if (!is.null(encoding_file) && encoding_file != "encoding.ukb") { + args = c(args, "-E", encoding_file) + } + out = system2(path, args) + if (out != 0) { warning("Convert did not seem to complete successfully") } + + return(out) } diff --git a/man/ukb_md5.Rd b/man/ukb_md5.Rd index b9bbc91..fa7bd94 100644 --- a/man/ukb_md5.Rd +++ b/man/ukb_md5.Rd @@ -13,6 +13,7 @@ ukb_unpack(file, key, ...) ukb_conv( file, + encoding_file = "encoding.ukb", type = c("r", "docs", "csv", "sas", "stata", "lims", "bulk", "txt"), ... ) @@ -27,6 +28,9 @@ ukb_fetch_bulk(file, key, start = NULL, outdir = NULL, ...) \item{key}{file to key to unpack/decrypt file} +\item{encoding_file}{encoding file to map for `ukbconv`. If want no +encoding, set to \code{NULL}} + \item{type}{type of conversion to do} \item{start}{start of the fetching, 1-indexed} From f2b0ac514b8bb0599b22be259fba91183a2681c9 Mon Sep 17 00:00:00 2001 From: muschellij2 Date: Thu, 23 Jul 2020 15:13:33 -0400 Subject: [PATCH 15/20] moved the fiel to later to keep code working --- R/uk_util.R | 5 +++-- man/ukb_md5.Rd | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/R/uk_util.R b/R/uk_util.R index d0f819c..06d7871 100644 --- a/R/uk_util.R +++ b/R/uk_util.R @@ -128,12 +128,13 @@ ukb_unpack = function(file, key, ...) { #' encoding, set to \code{NULL} #' @export ukb_conv = function(file, - encoding_file = "encoding.ukb", type = c("r", "docs", "csv", "sas", "stata", "lims", "bulk", - "txt"), ...) { + "txt"), + encoding_file = "encoding.ukb", + ...) { type = match.arg(type) if (!is.null(encoding_file)) { url = paste0("http://biobank.ndph.ox.ac.uk/showcase/util/", diff --git a/man/ukb_md5.Rd b/man/ukb_md5.Rd index fa7bd94..526aaee 100644 --- a/man/ukb_md5.Rd +++ b/man/ukb_md5.Rd @@ -13,8 +13,8 @@ ukb_unpack(file, key, ...) ukb_conv( file, - encoding_file = "encoding.ukb", type = c("r", "docs", "csv", "sas", "stata", "lims", "bulk", "txt"), + encoding_file = "encoding.ukb", ... ) @@ -28,11 +28,11 @@ ukb_fetch_bulk(file, key, start = NULL, outdir = NULL, ...) \item{key}{file to key to unpack/decrypt file} +\item{type}{type of conversion to do} + \item{encoding_file}{encoding file to map for `ukbconv`. If want no encoding, set to \code{NULL}} -\item{type}{type of conversion to do} - \item{start}{start of the fetching, 1-indexed} \item{outdir}{output directory of download} From b6a521f3860241b19782610a755835bee33ffbdb Mon Sep 17 00:00:00 2001 From: muschellij2 Date: Thu, 23 Jul 2020 15:51:53 -0400 Subject: [PATCH 16/20] updated fileset --- R/dataset.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/R/dataset.R b/R/dataset.R index 85a373b..0774506 100644 --- a/R/dataset.R +++ b/R/dataset.R @@ -6,6 +6,7 @@ globalVariables( "Kinship", "categorized_var", "dx", "freq", "tile_range", "lower", "upper", "mid", "frequency", "disease")) + #' Reads a UK Biobank phenotype fileset and returns a single dataset. #' #' A UK Biobank \emph{fileset} includes a \emph{.tab} file containing the raw data with field codes instead of variable names, an \emph{.r} (\emph{sic}) file containing code to read raw data (inserts categorical variable levels and labels), and an \emph{.html} file containing tables mapping field code to variable name, and labels and levels for categorical variables. @@ -140,6 +141,8 @@ ukb_df <- function(fileset, path = ".", n_threads = "dt", data.pos = 2, #' } #' ukb_df_field <- function(fileset, path = ".", data.pos = 2, as.lookup = FALSE) { + fileset = stringr::str_replace(fileset, "[.](r|html|tab)$", "") + html_file <- stringr::str_interp("${fileset}.html") html_internal_doc <- xml2::read_html(file.path(path, html_file)) html_table_nodes <- xml2::xml_find_all(html_internal_doc, "//table") @@ -222,6 +225,8 @@ description_to_name <- function(data) { read_ukb_tab <- function(fileset, column_type, path = ".", n_threads = "max", temporary = FALSE) { + fileset = stringr::str_replace(fileset, "[.](r|html|tab)$", "") + r_file <- stringr::str_interp("${fileset}.r") tab_file <- stringr::str_interp("${fileset}.tab") From a903ac6329b1df6f4339646bb776eb3516a0b8e3 Mon Sep 17 00:00:00 2001 From: muschellij2 Date: Mon, 24 Aug 2020 15:33:05 -0400 Subject: [PATCH 17/20] updated ukb_df to have withdraw_file added --- R/dataset.R | 34 ++++++++++++++++++++++++++++++++-- R/uk_util.R | 3 +-- man/ukb_df.Rd | 12 +++++++++++- 3 files changed, 44 insertions(+), 5 deletions(-) diff --git a/R/dataset.R b/R/dataset.R index 0774506..d7a55f4 100644 --- a/R/dataset.R +++ b/R/dataset.R @@ -17,6 +17,8 @@ globalVariables( #' @param data.pos Locates the data in your .html file. The .html file is read into a list; the default value data.pos = 2 indicates the second item in the list. (The first item in the list is the title of the table). You will probably not need to change this value, but if the need arises you can open the .html file in a browser and identify where in the file the data is. #' @param temporary Should the `R` file be copied to a temporary directory? #' Useful for permissions issues, especially on computing clusters. +#' @param withdraw_file file of identifiers of those who have +#' withdrawn from UK Biobank to exclude from the data set. #' #' @details The \strong{index} and \strong{array} from the UKB field code are preserved in the variable name, as two numbers separated by underscores at the end of the name e.g. \emph{variable_index_array}. \strong{index} refers the assessment instance (or visit). \strong{array} captures multiple answers to the same "question". See UKB documentation for detailed descriptions of \href{http://biobank.ctsu.ox.ac.uk/crystal/instance.cgi?id=2}{index} and \href{http://biobank.ctsu.ox.ac.uk/crystal/help.cgi?cd=array}{array}. #' @@ -49,7 +51,7 @@ globalVariables( #' } #' ukb_df <- function(fileset, path = ".", n_threads = "dt", data.pos = 2, - temporary = FALSE) { + temporary = FALSE, withdraw_file = NULL) { fileset = stringr::str_replace(fileset, "[.](r|html|tab)$", "") @@ -76,7 +78,32 @@ ukb_df <- function(fileset, path = ".", n_threads = "dt", data.pos = 2, ) ukb_key <- ukb_df_field(fileset, path = path) %>% - mutate(fread_column_type = col_type[col.type]) + dplyr::mutate(fread_column_type = col_type[col.type]) + + withdraw_ids = NULL + if (file.exists(withdraw_file)) { + withdraw_ids <- data.table::fread( + input = withdraw_file, + sep = "\t", + header = FALSE, + data.table = FALSE, + showProgress = FALSE, + nThread = if(n_threads == "max") { + parallel::detectCores() + } else if (n_threads == "dt") { + data.table::getDTthreads() + } else if (is.numeric(n_threads)) { + min(n_threads, parallel::detectCores()) + } + ) + if (ncol(withdraw_ids) > 1) + warning( + paste0( + "withdrawal_file has multiple columns, ", + "it should not be just one column (no header) of IDs") + ) + withdraw_ids = withdraw_ids[[1]] + } bad_col_type <- is.na(ukb_key$fread_column_type) @@ -109,6 +136,9 @@ ukb_df <- function(fileset, path = ".", n_threads = "dt", data.pos = 2, source(r_file, local = TRUE) names(bd) <- ukb_key$col.name[match(names(bd), ukb_key$field.tab)] + if (!is.null(withdraw_ids)) { + bd = bd[ !bd$eid %in% withdraw_ids, ] + } return(bd) } diff --git a/R/uk_util.R b/R/uk_util.R index 06d7871..a830967 100644 --- a/R/uk_util.R +++ b/R/uk_util.R @@ -84,7 +84,7 @@ ukb_util_path = function( #' @rdname ukb_util_path #' @export -ukb_encoding = function( outdir = tempdir()) { +ukb_encoding = function(outdir = tempdir()) { res = ukb_util_path(util = "encoding.ukb", download = TRUE, outdir = outdir) @@ -156,7 +156,6 @@ ukb_conv = function(file, warning("Convert did not seem to complete successfully") } - return(out) } diff --git a/man/ukb_df.Rd b/man/ukb_df.Rd index de1f468..87abbf3 100644 --- a/man/ukb_df.Rd +++ b/man/ukb_df.Rd @@ -4,7 +4,14 @@ \alias{ukb_df} \title{Reads a UK Biobank phenotype fileset and returns a single dataset.} \usage{ -ukb_df(fileset, path = ".", n_threads = "dt", data.pos = 2, temporary = FALSE) +ukb_df( + fileset, + path = ".", + n_threads = "dt", + data.pos = 2, + temporary = FALSE, + withdraw_file = NULL +) } \arguments{ \item{fileset}{The prefix for a UKB fileset, e.g., ukbxxxx (for ukbxxxx.tab, ukbxxxx.r, ukbxxxx.html)} @@ -17,6 +24,9 @@ ukb_df(fileset, path = ".", n_threads = "dt", data.pos = 2, temporary = FALSE) \item{temporary}{Should the `R` file be copied to a temporary directory? Useful for permissions issues, especially on computing clusters.} + +\item{withdraw_file}{file of identifiers of those who have +withdrawn from UK Biobank to exclude from the data set.} } \value{ A dataframe with variable names in snake_case (lowercase and separated by an underscore). From 5d666b8ab5cda8bfc25408075b8b4a45a15c9adc Mon Sep 17 00:00:00 2001 From: muschellij2 Date: Mon, 24 Aug 2020 16:50:20 -0400 Subject: [PATCH 18/20] added warning in case eid is not available --- R/dataset.R | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/R/dataset.R b/R/dataset.R index d7a55f4..7c9c2c7 100644 --- a/R/dataset.R +++ b/R/dataset.R @@ -99,8 +99,8 @@ ukb_df <- function(fileset, path = ".", n_threads = "dt", data.pos = 2, if (ncol(withdraw_ids) > 1) warning( paste0( - "withdrawal_file has multiple columns, ", - "it should not be just one column (no header) of IDs") + "withdrawal_file has multiple columns, ", + "it should not be just one column (no header) of IDs") ) withdraw_ids = withdraw_ids[[1]] } @@ -137,7 +137,15 @@ ukb_df <- function(fileset, path = ".", n_threads = "dt", data.pos = 2, names(bd) <- ukb_key$col.name[match(names(bd), ukb_key$field.tab)] if (!is.null(withdraw_ids)) { - bd = bd[ !bd$eid %in% withdraw_ids, ] + if ("eid" %in% colnames(bd)) { + bd = bd[ !bd$eid %in% withdraw_ids, ] + } else { + warning( + paste0( + "eid not in data set column name and withdraw IDs are given,", + " no records were dropped!") + ) + } } return(bd) } From 3182a8251fa597dcd06d5aeca262a09e79319e01 Mon Sep 17 00:00:00 2001 From: kenhanscombe Date: Sun, 30 Aug 2020 20:20:38 +0100 Subject: [PATCH 19/20] Code style and naming updates Edited code style to match ukbtools package, and ukbtools and UKB naming conventions. Also added documentation for each function. --- NAMESPACE | 12 +-- NEWS.md | 4 +- R/{uk_util.R => filehandlers.R} | 149 ++++++++++++++++++++------------ man/ukb_md5.Rd | 45 ---------- man/ukb_util_conv.Rd | 27 ++++++ man/ukb_util_encoding.Rd | 14 +++ man/ukb_util_fetch.Rd | 23 +++++ man/ukb_util_get.Rd | 30 +++++++ man/ukb_util_md5.Rd | 20 +++++ man/ukb_util_path.Rd | 33 ------- man/ukb_util_unpack.Rd | 19 ++++ 11 files changed, 234 insertions(+), 142 deletions(-) rename R/{uk_util.R => filehandlers.R} (52%) delete mode 100644 man/ukb_md5.Rd create mode 100644 man/ukb_util_conv.Rd create mode 100644 man/ukb_util_encoding.Rd create mode 100644 man/ukb_util_fetch.Rd create mode 100644 man/ukb_util_get.Rd create mode 100644 man/ukb_util_md5.Rd delete mode 100644 man/ukb_util_path.Rd create mode 100644 man/ukb_util_unpack.Rd diff --git a/NAMESPACE b/NAMESPACE index a7e34b4..5826c91 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,13 +2,10 @@ export(ukb_centre) export(ukb_context) -export(ukb_conv) export(ukb_df) export(ukb_df_duplicated_name) export(ukb_df_field) export(ukb_df_full_join) -export(ukb_encoding) -export(ukb_fetch_bulk) export(ukb_gen_excl) export(ukb_gen_excl_to_na) export(ukb_gen_het) @@ -29,9 +26,12 @@ export(ukb_icd_diagnosis) export(ukb_icd_freq_by) export(ukb_icd_keyword) export(ukb_icd_prevalence) -export(ukb_md5) -export(ukb_unpack) -export(ukb_util_path) +export(ukb_util_conv) +export(ukb_util_encoding) +export(ukb_util_fetch) +export(ukb_util_get) +export(ukb_util_md5) +export(ukb_util_unpack) import(dplyr) import(ggplot2) import(grid) diff --git a/NEWS.md b/NEWS.md index d520a20..45697ab 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,9 +3,9 @@ Bug fix: -dplyr update broke `ukb_icd_diagnosis`. Fixed in dev version. +* dplyr update broke `ukb_icd_diagnosis`. Fixed in dev version. -* Added `ukb_util_*` functions to be able to perform system-level calls for fetching and unpacking. +* Added `ukb_util_*` functions to be able to perform system-level calls for fetching, unpacking, converting UKB files. diff --git a/R/uk_util.R b/R/filehandlers.R similarity index 52% rename from R/uk_util.R rename to R/filehandlers.R index a830967..a5feecb 100644 --- a/R/uk_util.R +++ b/R/filehandlers.R @@ -1,6 +1,9 @@ + os_type <- function() { .Platform$OS.type } + + sys_type <- function() { if (os_type() == "windows") { "windows" @@ -16,24 +19,24 @@ sys_type <- function() { } } -#' Get Path to UKB Utilitiy -#' -#' @param util Name of the utility -#' @param download Should the utility be downloaded if not found? -#' @param outdir The output directory to download the utility + +#' Downloads individual UKB utlities and file handlers. #' -#' @return A path to the utility +#' @param util Name of the utility. Must be one of \code{"ukbmd5"}, \code{"ukbconv"}, \code{"ukbunpack"}, \code{"ukbfetch"}, \code{"ukblink"}, \code{"ukbgene"}, \code{"encoding.ukb"}. For a description of the UKB utilies and file handlers, see \href{http://biobank.ndph.ox.ac.uk/showcase/download.cgi}{UKB Downloads}. +#' @param download Should the utility be downloaded if not found? Default is \code{TRUE}. +#' @param out_dir The output directory to download the UKB utility to. Default \code{tempdir()} - UKB utility and temporary directory will be deleted at the end of the current session. +#' @return Path to the downloaded utility. #' @export #' #' @examples -#' md5 = ukb_util_path("ukbmd5") +#' md5 = ukb_util_get("ukbmd5") #' file.remove(md5) -ukb_util_path = function( +ukb_util_get <- function( util = c("ukbmd5", "ukbconv", "ukbunpack", "ukbfetch", "ukblink", "ukbgene", "encoding.ukb"), download = TRUE, - outdir = tempdir()) { + out_dir = tempdir()) { util = match.arg(util) st = sys_type() @@ -44,75 +47,96 @@ ukb_util_path = function( ext = ".exe" } - tool_path = Sys.which(util) - if (nzchar(tool_path)) { - return(tool_path) + exec_path = Sys.which(util) + if (nzchar(exec_path)) { + return(exec_path) } if (download) { + util_url = paste0("http://biobank.ndph.ox.ac.uk/showcase/util/", + util, ext) + dest_file = file.path(out_dir, basename(util_url)) + + if (!file.exists(dest_file)) { + utils::download.file(util_url, destfile = dest_file, + mode = "wb") + } if (st %in% "macos") { noah = Sys.which("noah") if (!nzchar(noah)) { warning( paste0( - "You may need noah to use these ", - "linux execs on Mac OSX,", - " See https://github.com/linux-noah/noah ") + "You may need noah to use this UKB utility ", + "(a linux executable) on Mac OSX.", + " See https://github.com/linux-noah/noah"), + call. = FALSE ) } } - url = paste0("http://biobank.ndph.ox.ac.uk/showcase/util/", - util, ext) - destfile = file.path(outdir, basename(url)) - if (!file.exists(destfile)) { - utils::download.file(url, destfile = destfile, - mode = "wb") - } - Sys.chmod(destfile) - return(destfile) + Sys.chmod(dest_file) + return(dest_file) + } else { + stop( paste0("Cannot find tool: ", util, - ", may need to modify PATH", + ", you may need to modify your PATH", "so that Sys.which('", util, "')", - "returns the path to the tool") + "returns the path to the utility.") ) } } -#' @rdname ukb_util_path + + + +#' Downloads encoding dictionaries for use with ukb_util_conv. +#' +#' @param out_dir The output directory to download the UKB utility to. Default \code{tempdir()}. +#' +# #' @rdname ukb_util_get #' @export -ukb_encoding = function(outdir = tempdir()) { - res = ukb_util_path(util = "encoding.ukb", +ukb_util_encoding <- function(out_dir = tempdir()) { + res = ukb_util_get(util = "encoding.ukb", download = TRUE, - outdir = outdir) + out_dir = out_dir) } -#' UKB MD5 Checksum + + + +#' Calculates size and MD5 of a UKB utlity file. #' -#' @param file name of file to run utility on -#' @param ... additional arguments to pass to -#' \code{\link{ukb_util_path}} +#' @param file Path to file to run MD5 utility on. +#' @param ... Additional arguments to pass to +#' \code{\link{ukb_util_get}}. #' #' @return A character string #' #' @export -ukb_md5 = function(file, ...) { - path = ukb_util_path("ukbmd5", ...) - out = system2(path, file, stdout = TRUE) +ukb_util_md5 <- function(file, ...) { + path = ukb_util_get("ukbmd5", ...) + out = system2(path, args = file, stdout = TRUE) out = out[grepl("MD5=", out)] out = sub(".*MD5=", "", out) return(out) } -#' @rdname ukb_md5 + + +#' Unpacks (decrypts and decompresses) UKB data. +#' +# #' @rdname ukb_util_md5 +#' @param file Path to file to unpack/decrypt. #' @param key file to key to unpack/decrypt file +#' @param ... Additional arguments to pass to +#' \code{\link{ukb_util_get}}. #' @export -ukb_unpack = function(file, key, ...) { - path = ukb_util_path("ukbunpack", ...) +ukb_util_unpack <- function(file, key, ...) { + path = ukb_util_get("ukbunpack", ...) out = system2(path, c(file, key)) if (out != 0) { warning("Unpacking did not seem to complete successfully") @@ -122,12 +146,19 @@ ukb_unpack = function(file, key, ...) { } -#' @rdname ukb_md5 -#' @param type type of conversion to do + + +#' Converts unpacked UKB data to other formats. +#' +# #' @rdname ukb_util_md5 +#' @param file Path to decrypted file to convert. +#' @param type Type of conversion to do. #' @param encoding_file encoding file to map for `ukbconv`. If want no #' encoding, set to \code{NULL} +#' @param ... Additional arguments to pass to +#' \code{\link{ukb_util_get}}. #' @export -ukb_conv = function(file, +ukb_util_conv <- function(file, type = c("r", "docs", "csv", "sas", "stata", @@ -144,7 +175,7 @@ ukb_conv = function(file, mode = "wb") } } - path = ukb_util_path("ukbconv", ...) + path = ukb_util_get("ukbconv", ...) args = c(file, type) # if not default file if (!is.null(encoding_file) && encoding_file != "encoding.ukb") { @@ -159,30 +190,36 @@ ukb_conv = function(file, return(out) } -#' @rdname ukb_md5 + + + +#' Downloads approved bulk data files. +#' +# #' @rdname ukb_util_md5 +#' @param file Path to ?. +#' @param key Path to key file. #' @param start start of the fetching, 1-indexed -#' @param outdir output directory of download +#' @param out_dir output directory of download +#' @param ... Additional arguments to pass to +#' \code{\link{ukb_util_get}}. #' @export -ukb_fetch_bulk = function( - file, - key, - start = NULL, - outdir = NULL, - ...) { +ukb_util_fetch <- function(file, key, start = NULL, out_dir = NULL, ...) { stopifnot(file.exists(file)) file = normalizePath(file, mustWork = TRUE, winslash = "/") key = normalizePath(key, mustWork = TRUE, winslash = "/") + if (nchar(file) > 64) { warning("File may be too long > 64 characters") } + if (nchar(key) > 64) { - warning("key file may be too long > 64 characters") + warning("Key file may be too long > 64 characters") } owd = getwd() - if (!is.null(outdir)) { - setwd(outdir) + if (!is.null(out_dir)) { + setwd(out_dir) on.exit({ setwd(owd) }, add = TRUE) @@ -199,7 +236,7 @@ ukb_fetch_bulk = function( } } - path = ukb_util_path("ukbfetch", ...) + path = ukb_util_get("ukbfetch", ...) bfile = paste0("-b", file) akey = paste0("-a", key) diff --git a/man/ukb_md5.Rd b/man/ukb_md5.Rd deleted file mode 100644 index 526aaee..0000000 --- a/man/ukb_md5.Rd +++ /dev/null @@ -1,45 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/uk_util.R -\name{ukb_md5} -\alias{ukb_md5} -\alias{ukb_unpack} -\alias{ukb_conv} -\alias{ukb_fetch_bulk} -\title{UKB MD5 Checksum} -\usage{ -ukb_md5(file, ...) - -ukb_unpack(file, key, ...) - -ukb_conv( - file, - type = c("r", "docs", "csv", "sas", "stata", "lims", "bulk", "txt"), - encoding_file = "encoding.ukb", - ... -) - -ukb_fetch_bulk(file, key, start = NULL, outdir = NULL, ...) -} -\arguments{ -\item{file}{name of file to run utility on} - -\item{...}{additional arguments to pass to -\code{\link{ukb_util_path}}} - -\item{key}{file to key to unpack/decrypt file} - -\item{type}{type of conversion to do} - -\item{encoding_file}{encoding file to map for `ukbconv`. If want no -encoding, set to \code{NULL}} - -\item{start}{start of the fetching, 1-indexed} - -\item{outdir}{output directory of download} -} -\value{ -A character string -} -\description{ -UKB MD5 Checksum -} diff --git a/man/ukb_util_conv.Rd b/man/ukb_util_conv.Rd new file mode 100644 index 0000000..764a82a --- /dev/null +++ b/man/ukb_util_conv.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filehandlers.R +\name{ukb_util_conv} +\alias{ukb_util_conv} +\title{Convert unpacked UKB data to other formats.} +\usage{ +ukb_util_conv( + file, + type = c("r", "docs", "csv", "sas", "stata", "lims", "bulk", "txt"), + encoding_file = "encoding.ukb", + ... +) +} +\arguments{ +\item{file}{Path to decrypted file to convert.} + +\item{type}{Type of conversion to do.} + +\item{encoding_file}{encoding file to map for `ukbconv`. If want no +encoding, set to \code{NULL}} + +\item{...}{Additional arguments to pass to +\code{\link{ukb_util_get}}.} +} +\description{ +Convert unpacked UKB data to other formats. +} diff --git a/man/ukb_util_encoding.Rd b/man/ukb_util_encoding.Rd new file mode 100644 index 0000000..7ec1aee --- /dev/null +++ b/man/ukb_util_encoding.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filehandlers.R +\name{ukb_util_encoding} +\alias{ukb_util_encoding} +\title{Downloads encoding dictionaries for use with ukb_util_conv.} +\usage{ +ukb_util_encoding(out_dir = tempdir()) +} +\arguments{ +\item{out_dir}{The output directory to download the UKB utility to. Default \code{tempdir()}.} +} +\description{ +Downloads encoding dictionaries for use with ukb_util_conv. +} diff --git a/man/ukb_util_fetch.Rd b/man/ukb_util_fetch.Rd new file mode 100644 index 0000000..02dbcec --- /dev/null +++ b/man/ukb_util_fetch.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filehandlers.R +\name{ukb_util_fetch} +\alias{ukb_util_fetch} +\title{Downloads approved bulk data files.} +\usage{ +ukb_util_fetch(file, key, start = NULL, out_dir = NULL, ...) +} +\arguments{ +\item{file}{Path to decrypted file to convert.} + +\item{key}{Path to key to unpack/decrypt file.} + +\item{start}{start of the fetching, 1-indexed} + +\item{out_dir}{output directory of download} + +\item{...}{Additional arguments to pass to +\code{\link{ukb_util_get}}.} +} +\description{ +Downloads approved bulk data files. +} diff --git a/man/ukb_util_get.Rd b/man/ukb_util_get.Rd new file mode 100644 index 0000000..fed91cf --- /dev/null +++ b/man/ukb_util_get.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filehandlers.R +\name{ukb_util_get} +\alias{ukb_util_get} +\title{Downloads individual UKB utlities and file handlers.} +\usage{ +ukb_util_get( + util = c("ukbmd5", "ukbconv", "ukbunpack", "ukbfetch", "ukblink", "ukbgene", + "encoding.ukb"), + download = TRUE, + out_dir = tempdir() +) +} +\arguments{ +\item{util}{Name of the utility. Must be one of \code{"ukbmd5"}, \code{"ukbconv"}, \code{"ukbunpack"}, \code{"ukbfetch"}, \code{"ukblink"}, \code{"ukbgene"}, \code{"encoding.ukb"}. For a description of the UKB utilies and file handlers, see \href{http://biobank.ndph.ox.ac.uk/showcase/download.cgi}{UKB Downloads}.} + +\item{download}{Should the utility be downloaded if not found? Default is \code{TRUE}.} + +\item{out_dir}{The output directory to download the UKB utility to. Default \code{tempdir()}.} +} +\value{ +Path to the downloaded utility. +} +\description{ +Downloads individual UKB utlities and file handlers. +} +\examples{ +md5 = ukb_util_get("ukbmd5") +file.remove(md5) +} diff --git a/man/ukb_util_md5.Rd b/man/ukb_util_md5.Rd new file mode 100644 index 0000000..78671fa --- /dev/null +++ b/man/ukb_util_md5.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filehandlers.R +\name{ukb_util_md5} +\alias{ukb_util_md5} +\title{Calculates size and MD5 of a UKB utlity file.} +\usage{ +ukb_util_md5(file, ...) +} +\arguments{ +\item{file}{Path to file to run MD5 utility on.} + +\item{...}{Additional arguments to pass to +\code{\link{ukb_util_get}}.} +} +\value{ +A character string +} +\description{ +Calculates size and MD5 of a UKB utlity file. +} diff --git a/man/ukb_util_path.Rd b/man/ukb_util_path.Rd deleted file mode 100644 index 722b61d..0000000 --- a/man/ukb_util_path.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/uk_util.R -\name{ukb_util_path} -\alias{ukb_util_path} -\alias{ukb_encoding} -\title{Get Path to UKB Utilitiy} -\usage{ -ukb_util_path( - util = c("ukbmd5", "ukbconv", "ukbunpack", "ukbfetch", "ukblink", "ukbgene", - "encoding.ukb"), - download = TRUE, - outdir = tempdir() -) - -ukb_encoding(outdir = tempdir()) -} -\arguments{ -\item{util}{Name of the utility} - -\item{download}{Should the utility be downloaded if not found?} - -\item{outdir}{The output directory to download the utility} -} -\value{ -A path to the utility -} -\description{ -Get Path to UKB Utilitiy -} -\examples{ -md5 = ukb_util_path("ukbmd5") -file.remove(md5) -} diff --git a/man/ukb_util_unpack.Rd b/man/ukb_util_unpack.Rd new file mode 100644 index 0000000..bebb820 --- /dev/null +++ b/man/ukb_util_unpack.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filehandlers.R +\name{ukb_util_unpack} +\alias{ukb_util_unpack} +\title{Unpacks (decrypts and decompresses) UKB data.} +\usage{ +ukb_util_unpack(file, key, ...) +} +\arguments{ +\item{file}{Path to file to unpack/decrypt.} + +\item{key}{file to key to unpack/decrypt file} + +\item{...}{Additional arguments to pass to +\code{\link{ukb_util_get}}.} +} +\description{ +Unpacks (decrypts and decompresses) UKB data. +} From 9432ad80a8971ba23b68c9f5e59b6f54a3a0cdc6 Mon Sep 17 00:00:00 2001 From: muschellij2 Date: Wed, 16 Dec 2020 12:28:17 -0500 Subject: [PATCH 20/20] updated the withdrawal_file argument --- R/dataset.R | 2 +- R/filehandlers.R | 2 +- man/ukb_util_conv.Rd | 4 ++-- man/ukb_util_fetch.Rd | 4 ++-- man/ukb_util_get.Rd | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/R/dataset.R b/R/dataset.R index 7c9c2c7..b35bcc1 100644 --- a/R/dataset.R +++ b/R/dataset.R @@ -81,7 +81,7 @@ ukb_df <- function(fileset, path = ".", n_threads = "dt", data.pos = 2, dplyr::mutate(fread_column_type = col_type[col.type]) withdraw_ids = NULL - if (file.exists(withdraw_file)) { + if (!is.null(withdraw_file) && file.exists(withdraw_file)) { withdraw_ids <- data.table::fread( input = withdraw_file, sep = "\t", diff --git a/R/filehandlers.R b/R/filehandlers.R index a5feecb..890b311 100644 --- a/R/filehandlers.R +++ b/R/filehandlers.R @@ -196,7 +196,7 @@ ukb_util_conv <- function(file, #' Downloads approved bulk data files. #' # #' @rdname ukb_util_md5 -#' @param file Path to ?. +#' @param file Path to bulk file #' @param key Path to key file. #' @param start start of the fetching, 1-indexed #' @param out_dir output directory of download diff --git a/man/ukb_util_conv.Rd b/man/ukb_util_conv.Rd index 764a82a..ab4f75f 100644 --- a/man/ukb_util_conv.Rd +++ b/man/ukb_util_conv.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/filehandlers.R \name{ukb_util_conv} \alias{ukb_util_conv} -\title{Convert unpacked UKB data to other formats.} +\title{Converts unpacked UKB data to other formats.} \usage{ ukb_util_conv( file, @@ -23,5 +23,5 @@ encoding, set to \code{NULL}} \code{\link{ukb_util_get}}.} } \description{ -Convert unpacked UKB data to other formats. +Converts unpacked UKB data to other formats. } diff --git a/man/ukb_util_fetch.Rd b/man/ukb_util_fetch.Rd index 02dbcec..8a7acbd 100644 --- a/man/ukb_util_fetch.Rd +++ b/man/ukb_util_fetch.Rd @@ -7,9 +7,9 @@ ukb_util_fetch(file, key, start = NULL, out_dir = NULL, ...) } \arguments{ -\item{file}{Path to decrypted file to convert.} +\item{file}{Path to bulk file} -\item{key}{Path to key to unpack/decrypt file.} +\item{key}{Path to key file.} \item{start}{start of the fetching, 1-indexed} diff --git a/man/ukb_util_get.Rd b/man/ukb_util_get.Rd index fed91cf..ccbeaf6 100644 --- a/man/ukb_util_get.Rd +++ b/man/ukb_util_get.Rd @@ -16,7 +16,7 @@ ukb_util_get( \item{download}{Should the utility be downloaded if not found? Default is \code{TRUE}.} -\item{out_dir}{The output directory to download the UKB utility to. Default \code{tempdir()}.} +\item{out_dir}{The output directory to download the UKB utility to. Default \code{tempdir()} - UKB utility and temporary directory will be deleted at the end of the current session.} } \value{ Path to the downloaded utility.