This is an automated email from the git hooks/post-receive script. tille pushed a commit to branch master in repository r-bioc-genomeinfodbdata.
commit c445368d87ffbd9c2b6811376f31c443aaf596c2 Author: Andreas Tille <[email protected]> Date: Fri Sep 29 13:13:20 2017 +0200 New upstream version 0.99.0 --- DESCRIPTION | 10 ++++ NAMESPACE | 1 + data/specData.rda | Bin 0 -> 7044532 bytes data/speciesMap.rda | Bin 0 -> 8772520 bytes data/validTaxIds.rda | Bin 0 -> 416192 bytes debian/README.source | 16 ----- debian/changelog | 5 -- debian/compat | 1 - debian/control | 20 ------- debian/copyright | 106 ---------------------------------- debian/rules | 4 -- debian/source/format | 1 - debian/watch | 3 - inst/scripts/updateGenomeInfoDbData.R | 65 +++++++++++++++++++++ man/GenomeInfoDbData-package.Rd | 43 ++++++++++++++ 15 files changed, 119 insertions(+), 156 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..b12ffd9 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,10 @@ +Package: GenomeInfoDbData +Title: Species and taxonomy ID look up tables used by GenomeInfoDb +Description: Files for mapping between NCBI taxonomy ID and species. Used + by functions in the GenomeInfoDb package. 
+Version: 0.99.0 +Author: Bioconductor Core Team +Maintainer: Bioconductor Maintainer <[email protected]> +Depends: R (>= 3.3) +biocViews: AnnotationData, Organism +License: Artistic-2.0 diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/NAMESPACE @@ -0,0 +1 @@ + diff --git a/data/specData.rda b/data/specData.rda new file mode 100644 index 0000000..69b9848 Binary files /dev/null and b/data/specData.rda differ diff --git a/data/speciesMap.rda b/data/speciesMap.rda new file mode 100644 index 0000000..86807c2 Binary files /dev/null and b/data/speciesMap.rda differ diff --git a/data/validTaxIds.rda b/data/validTaxIds.rda new file mode 100644 index 0000000..5a2866b Binary files /dev/null and b/data/validTaxIds.rda differ diff --git a/debian/README.source b/debian/README.source deleted file mode 100644 index 4ad890c..0000000 --- a/debian/README.source +++ /dev/null @@ -1,16 +0,0 @@ -This package contains three mapping objects: - -* data/speciesMap.rda: A data frame with columns 'tax_id', 'genus', and - 'species'. Used to retrieve taxonomy ID by species and returns list of - available species. - -* data/validTaxIds.rda: An integer vector of valid taxonomy IDs created - from 'speciesMap'. Used internally for quick taxonomy ID look ups. - -* data/specData.rds: A data frame with columns 'taxon' and 'species'. - Used internally to retrieve species by taxonomy ID. - - -Scripts to generate these files are in inst/scripts. 
-All originate from the public taxonomy dump at -ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz diff --git a/debian/changelog b/debian/changelog deleted file mode 100644 index 17bd319..0000000 --- a/debian/changelog +++ /dev/null @@ -1,5 +0,0 @@ -r-bioc-genomeinfodbdata (0.99.0-1) unstable; urgency=medium - - * Initial release (Closes: #862550) - - -- Graham Inggs <[email protected]> Mon, 15 May 2017 16:12:39 +0200 diff --git a/debian/compat b/debian/compat deleted file mode 100644 index f599e28..0000000 --- a/debian/compat +++ /dev/null @@ -1 +0,0 @@ -10 diff --git a/debian/control b/debian/control deleted file mode 100644 index 032a012..0000000 --- a/debian/control +++ /dev/null @@ -1,20 +0,0 @@ -Source: r-bioc-genomeinfodbdata -Section: gnu-r -Priority: optional -Maintainer: Debian Med Packaging Team <[email protected]> -Uploaders: Graham Inggs <[email protected]> -Build-Depends: debhelper (>= 10), dh-r, r-base-dev -Standards-Version: 3.9.8 -Homepage: https://bioconductor.org/packages/GenomeInfoDbData/ -Vcs-Browser: https://anonscm.debian.org/viewvc/debian-med/trunk/packages/R/r-bioc-genomeinfodbdata/trunk/ -Vcs-Svn: svn://anonscm.debian.org/debian-med/trunk/packages/R/r-bioc-genomeinfodbdata/trunk/ - -Package: r-bioc-genomeinfodbdata -Architecture: all -Depends: ${R:Depends}, ${misc:Depends}, ${shlibs:Depends} -Recommends: ${R:Recommends} -Suggests: ${R:Suggests} -Description: BioConductor species and taxonomy ID look up tables - This package contains files for mapping between NCBI taxonomy ID and species. - . - It is used by functions in the r-bioc-genomeinfodb package. 
diff --git a/debian/copyright b/debian/copyright deleted file mode 100644 index b7e6571..0000000 --- a/debian/copyright +++ /dev/null @@ -1,106 +0,0 @@ -Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ -Upstream-Name: GenomeInfoDbData -Upstream-Contact: Bioconductor Maintainer <[email protected]> -Source: https://bioconductor.org/packages/GenomeInfoDbData/ - -Files: * -Copyright: 2006-2017 Sonali Arora, Martin Morgan, Marc Carlson, H. Pagès -License: Artistic-2.0 - -Files: debian/* -Copyright: 2017 Graham Inggs <[email protected]> -License: Artistic-2.0 - -License: Artistic-2.0 - The "Artistic License" - . - Preamble - . - 1. You may make and give away verbatim copies of the source form of the - Standard Version of this Package without restriction, provided that - you duplicate all of the original copyright notices and associated - disclaimers. - . - 2. You may apply bug fixes, portability fixes and other modifications - derived from the Public Domain or from the Copyright Holder. A - Package modified in such a way shall still be considered the Standard - Version. - . - 3. You may otherwise modify your copy of this Package in any way, - provided that you insert a prominent notice in each changed file stating - how and when you changed that file, and provided that you do at least - ONE of the following: - . - a) place your modifications in the Public Domain or otherwise make them - Freely Available, such as by posting said modifications to Usenet or - an equivalent medium, or placing the modifications on a major archive - site such as uunet.uu.net, or by allowing the Copyright Holder to include - your modifications in the Standard Version of the Package. - . - b) use the modified Package only within your corporation or organization. - . 
- c) rename any non-standard executables so the names do not conflict - with standard executables, which must also be provided, and provide - a separate manual page for each non-standard executable that clearly - documents how it differs from the Standard Version. - . - d) make other distribution arrangements with the Copyright Holder. - . - 4. You may distribute the programs of this Package in object code or - executable form, provided that you do at least ONE of the following: - . - a) distribute a Standard Version of the executables and library files, - together with instructions (in the manual page or equivalent) on where - to get the Standard Version. - . - b) accompany the distribution with the machine-readable source of - the Package with your modifications. - . - c) give non-standard executables non-standard names, and clearly - document the differences in manual pages (or equivalent), together - with instructions on where to get the Standard Version. - . - d) make other distribution arrangements with the Copyright Holder. - . - 5. You may charge a reasonable copying fee for any distribution of this - Package. You may charge any fee you choose for support of this Package. - You may not charge a fee for this Package itself. However, you may - distribute this Package in aggregate with other (possibly commercial) - programs as part of a larger (possibly commercial) software distribution - provided that you do not advertise this Package as a product of your - own. You may embed this Package's interpreter within an executable of - yours (by linking); this shall be construed as a mere form of - aggregation, provided that the complete Standard Version of the - interpreter is so embedded. - . - 6. 
The scripts and library files supplied as input to or produced as - output from the programs of this Package do not automatically fall under - the copyright of this Package, but belong to whoever generated them, and - may be sold commercially, and may be aggregated with this Package. If - such scripts or library files are aggregated with this Package via the - so-called "undump" or "unexec" methods of producing a binary executable - image, then distribution of such an image shall neither be construed as - a distribution of this Package nor shall it fall under the restrictions - of Paragraphs 3 and 4, provided that you do not represent such an - executable image as a Standard Version of this Package. - . - 7. C subroutines (or comparably compiled subroutines in other - languages) supplied by you and linked into this Package in order to - emulate subroutines and variables of the language defined by this - Package shall not be considered part of this Package, but are the - equivalent of input as in Paragraph 6, provided these subroutines do - not change the language in any way that would cause it to fail the - regression tests for the language. - . - 8. Aggregation of this Package with a commercial distribution is always - permitted provided that the use of this Package is embedded; that is, - when no overt attempt is made to make this Package's interfaces visible - to the end user of the commercial distribution. Such use shall not be - construed as a distribution of this Package. - . - 9. The name of the Copyright Holder may not be used to endorse or promote - products derived from this software without specific prior written permission. - . - 10. THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR - IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED - WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
diff --git a/debian/rules b/debian/rules deleted file mode 100755 index 68d9a36..0000000 --- a/debian/rules +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/make -f - -%: - dh $@ --buildsystem R diff --git a/debian/source/format b/debian/source/format deleted file mode 100644 index 163aaf8..0000000 --- a/debian/source/format +++ /dev/null @@ -1 +0,0 @@ -3.0 (quilt) diff --git a/debian/watch b/debian/watch deleted file mode 100644 index 9248da7..0000000 --- a/debian/watch +++ /dev/null @@ -1,3 +0,0 @@ -version=4 -opts=downloadurlmangle=s?^(.*)\.\.?https:$1packages/release/data/annotation? \ -http://www.bioconductor.org/packages/release/data/annotation/html/GenomeInfoDbData.html .*/GenomeInfoDbData_(.*).tar.gz diff --git a/inst/scripts/updateGenomeInfoDbData.R b/inst/scripts/updateGenomeInfoDbData.R new file mode 100644 index 0000000..8cfcca8 --- /dev/null +++ b/inst/scripts/updateGenomeInfoDbData.R @@ -0,0 +1,65 @@ +## Scripts for updating specData, speciesMap and validTaxId + +## Download and unpack mapping file: +## ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz + +## Generates specData +.processTaxNamesFile <- function(filesDir=getwd()){ +## species <- read.delim('names.dmp',header = FALSE,sep = "|") + dest <- file.path(filesDir, "names.dmp") + data <- read.delim(dest, header=FALSE, sep="\t", quote="", + stringsAsFactors=FALSE) + species <- data[,seq(1, dim(data)[2], by=2)] ## Throw away 'pipe columns' + colnames(species) <- c('tax_id','name_txt','unique_name','name_class') + ## keep only some cols + species <- species[,c(1:2,4)] + ## throw away tabs from second col + species[[2]] <- gsub('\t','',species[[2]]) + ## And the third col + species[[3]] <- gsub('\t','',species[[3]]) + ## throw away rows where the third column doesn't say 'scientific name' + keep <- grepl('scientific name', species[[3]]) + species <- species[keep,1:2] + + ## split second column by first space: + rawSpec <- species[[2]] + spltSpec <- strsplit(rawSpec, split=" ") + genusDat <- 
unlist(lapply(spltSpec, function(x){x[1]})) + .getRest <- function(x){ + if(length(x) > 1){ + return(paste(x[2:length(x)], collapse=" ")) + }else{ + return(NA) + } + } + speciesDat <- unlist(lapply(spltSpec, .getRest)) + specData <- data.frame(tax_id=as.integer(species[[1]]), ## integer + genus=as.factor(genusDat), ## factor + species=speciesDat, ## character + stringsAsFactors=FALSE) + save(specData, file='specData.rda', compress="xz") +} + +## Generates speciesMap and validTaxIds +.processSpeciesMapData <- function(){ + con <- file('names.dmp') + species <- readLines(con) + close(con) + splt <- strsplit(species, split='\\t\\|\\t') + ## Throw away elements where column 4 is not 'scientific name' or 'synonym' + idx1 <- unlist(lapply(splt, function(x){grepl('scientific name', x[4])})) + idx2 <- unlist(lapply(splt, function(x){grepl('synonym', x[4])})) + idx <- idx1 | idx2 + splt <- splt[idx] + ## and keep only 1st two elements + taxon <- as.integer(unlist(lapply(splt, function(x){x[1]}))) + species <- unlist(lapply(splt, function(x){x[2]})) + speciesMap <- data.frame(taxon, ## integer + species, ## character + stringsAsFactors=FALSE) + save(speciesMap, file='speciesMap.rda', compress="xz") + + ## Then get the valid Tax IDs. + validTaxIds <- unique(speciesMap$taxon) ## integer + save(validTaxIds, file='validTaxIds.rda', compress="xz") +} diff --git a/man/GenomeInfoDbData-package.Rd b/man/GenomeInfoDbData-package.Rd new file mode 100644 index 0000000..c80a8a5 --- /dev/null +++ b/man/GenomeInfoDbData-package.Rd @@ -0,0 +1,43 @@ +\name{GenomeInfoDb-package} + +\alias{GenomeInfoDb-package} +\alias{speciesMap} +\alias{validTaxIds} +\alias{specData} + +\title{Species and taxonomy ID look up tables} + +\description{ + This package contains three mapping objects: + \itemize{ + \item speciesMap: A data frame with columns \sQuote{tax_id}, + \sQuote{genus}, and \sQuote{species}. Used to retrieve taxonomy + ID by species and returns list of available species. 
+ \item validTaxIds: An integer vector of valid taxonomy IDs created from + \code{speciesMap}. Used internally for quick taxonomy ID look ups. + \item specData: A data frame with columns \sQuote{taxon} and + \sQuote{species}. Used internally to retrieve species by taxonomy ID. + } +} + +\details{ + Scripts to generate these files are in GenomeInfoDbData/inst/scripts. All + originate from the taxdump download at + \url{ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz}. +} + +\usage{ +data(speciesMap) +data(validTaxIds) +data(specData) +} + +\examples{ +data(speciesMap) +sapply(speciesMap, class) # taxon species + # "integer" "character" +subset(speciesMap, species=="Homo sapiens")$taxon # [1] 9606 +} + +\keyword{datasets} +\author{Bioconductor Core Team} -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/r-bioc-genomeinfodbdata.git _______________________________________________ debian-med-commit mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit
