version 2.1.1

4547049a · Francesco Favero · Gabor Csardi · 28dcfdc8 · 4547049a · 4547049a
Commit 4547049a authored 10 years ago by Francesco Favero Committed by Gabor Csardi 10 years ago
--- a/DESCRIPTION
+++ b/DESCRIPTION
 Package: sequenza
-Title: Copy number estimation from tumor genome sequencing data.
-Description: This package provides tools to analyze genomic sequencing data from
+Title: Copy Number Estimation from Tumor Genome Sequencing Data
+Description: Tools to analyze genomic sequencing data from
 paired normal-tumor samples, including cellularity and ploidy estimation; mutation
 and copy number (allele-specific and total copy number) detection, quantification 
 and visualization.
-Version: 2.1.0
-Date: 2014-10-07
+Version: 2.1.1
+Date: 2015-01-20
 Author: Francesco Favero, Andrea M. Marquard, Tejal Joshi, Aron C. Eklund
 Maintainer: Francesco Favero <favero@cbs.dtu.dk>
 Depends: R (>= 3.0.0), copynumber, parallel, squash
@@ -13,7 +13,7 @@ Suggests:
 License: GPL-3
 URL: http://cbs.dtu.dk/biotools/sequenza/, Mailing list:
        https://groups.google.com/forum/#!forum/sequenza-user-group
-Packaged: 2014-10-08 14:25:00 UTC; favero
+Packaged: 2015-01-20 04:38:23 UTC; favero
 NeedsCompilation: no
 Repository: CRAN
-Date/Publication: 2014-10-08 17:22:18
+Date/Publication: 2015-01-27 16:55:55
--- a/MD5
+++ b/MD5
-b354c6c06f7a0b447b6d0f780ab0b012 *DESCRIPTION
+910f6c45e9e62ddf06de4d13bda4979e *DESCRIPTION
 6d09bc56bb99e294c2ec6382e914b2fd *NAMESPACE
-231d84899a494f11ec757d7d1e969b5e *NEWS
+2006f46b799bcf41bd5236f6a78f801c *NEWS
 aeda5901d16339bb5d61ee434422e86d *R/analysis.R
 47d5566ca1686c3dabde59008b84f989 *R/bayes.R
 592d7b6911d9d06e18b388eed8cbcea3 *R/graphics.R
 d4080194f646974cb4f3c0089378f6de *R/misc.R
 4020afb3d3685ecdc0ffa459ff213c2d *R/model.R
-97c0fa67caa5c8ee0376e599248546b6 *R/next.R
+e52bd46020494d28003d6615c41d1c0e *R/next.R
 4c2d13a41cc82e15cdc73cd9d0c25e2a *R/workflows.R
 dc22c9077dd9a1916c9062e69180a45e *build/vignette.rds
 33cb86ef72ebdd33f91839241f0bd890 *data/CP.example.RData
 f865934a3a177ac65318e881f0c0c3d3 *data/example.seqz.rda
 a13b971aa94a30d151176acb0bccf441 *data/example.seqz.txt.gz
-914fe0d8e5903edd47f5dab5d18c2c57 *exec/sequenza-utils.py
-d3e4e9be3b08fd3e40977f4f8e1aff9e *inst/CITATION
+473d6d18140e030b741d8e3685c271b3 *exec/sequenza-utils.py
+49a67dbefc97a501bc47492ca1108e0a *inst/CITATION
 e8a13df2363bce7b30d929083c7b40d7 *inst/doc/sequenza.R
-bb6d23f2d0984177ffdc53a31782a093 *inst/doc/sequenza.Rnw
-1da123b3bb96413596faca66349aa5cb *inst/doc/sequenza.pdf
+4120d5d3657936c411c952be04c7f031 *inst/doc/sequenza.Rnw
+4f7362a072270cae8a0ec89a2c59f342 *inst/doc/sequenza.pdf
 a665959a09431961a03bfca5d9e6887c *man/CP.data.Rd
-191710987d5ced5a9d432031d4f856f9 *man/VarScan2seqz.Rd
+2443f587e9f9624836a0aece9d24acaf *man/VarScan2seqz.Rd
 75446b25323a9826ec51350fee71236c *man/baf.bayes.Rd
 618a75158dae6ca2efbb7742e8068499 *man/baf.model.fit.Rd
 bd71f7319858ffdfa6f64301dd95d0f0 *man/chromosome.view.Rd
@@ -35,5 +35,5 @@ c5e45e6bc6f7b4eaadc160f0d28a72ab *man/theoretical.baf.Rd
 f2abe359b28581ac7f261a6185a69efb *man/types.matrix.Rd
 8a6379846fcba08794e7ecf7dcf50eb6 *man/windowValues.Rd
 2f9ad02f2ec9c382394dff14ce62af62 *man/workflows.Rd
-bb6d23f2d0984177ffdc53a31782a093 *vignettes/sequenza.Rnw
+4120d5d3657936c411c952be04c7f031 *vignettes/sequenza.Rnw
 fbb2da96e9546ed5994885d9b2b51994 *vignettes/sequenza.bib
--- a/NEWS
+++ b/NEWS
+Changes in version 2.1.1 (2015-01-20)
+ - Fix heterozygous detection when importing VarScan2 data of very high coverage seq.
+ - Cleanup sequenza-utils.py
+ - Update citation info to published manuscript
+
 Changes in version 2.1.0 (2014-10-08)
 - Add sequenza-utils.py function bam2seqz
 - Add raw data depth-ratio/Bf in the genome view plots

--- a/R/next.R
+++ b/R/next.R
@@ -46,7 +46,7 @@ sequenza2PyClone <- function(mut.tab, seg.cn, sample.id, norm.cn = 2) {
  na.exclude(pyclone.tsv)
 }

-VarScan2seqz <- function(varscan.somatic, varscan.copynumber = NULL) {
+VarScan2seqz <- function(varscan.somatic, varscan.copynumber = NULL, normal_var_freq = 0.25) {
   
   iupac.nucs     <- setNames(c('A', 'C', 'G', 'GT', 'AC', 'AG', 'CG', 'T', 'AT', 'CT'),
                              c('A', 'C', 'G', 'K', 'M', 'R', 'S', 'T', 'W', 'Y'))
@@ -56,11 +56,19 @@ VarScan2seqz <- function(varscan.somatic, varscan.copynumber = NULL) {
   varscan.somatic$normal_var_freq <- as.numeric(sub('%', '', varscan.somatic$normal_var_freq)) / 100
   varscan.somatic$tumor_var_freq  <- as.numeric(sub('%', '', varscan.somatic$tumor_var_freq)) / 100
   zygosity.normal <- zygosity.vect[varscan.somatic$normal_gt]
-   AB.normal  <- iupac.nucs[varscan.somatic$normal_gt]
-   AB.tumor    <- rep('.', length(AB.normal))
-   strand      <- AB.tumor
+   if (normal_var_freq > 0.5){
+      normal_var_freq <- 1 - normal_var_freq
+   }
+   idx             <- which(zygosity.normal == "het" &
+                   (normal_var_freq >= varscan.somatic$normal_var_freq |
+                   varscan.somatic$normal_var_freq >= (1-normal_var_freq)))
+   varscan.somatic <- varscan.somatic[-idx, ]
+   zygosity.normal <- zygosity.vect[varscan.somatic$normal_gt]
+   AB.normal    <- iupac.nucs[varscan.somatic$normal_gt]
+   AB.tumor     <- rep('.', length(AB.normal))
+   strand       <- AB.tumor
   depth.normal <- varscan.somatic$normal_reads1 + varscan.somatic$normal_reads2
-   depth.tumor <- varscan.somatic$tumor_reads1 + varscan.somatic$tumor_reads1
+   depth.tumor  <- varscan.somatic$tumor_reads1 + varscan.somatic$tumor_reads1
   depth.ratio  <- depth.tumor/depth.normal
   Af <- 1 - varscan.somatic$tumor_var_freq
   Bf <- rep(0, length(Af))

--- a/exec/sequenza-utils.py
+++ b/exec/sequenza-utils.py
@@ -16,7 +16,7 @@ from subprocess import Popen, check_call, PIPE
 from tempfile   import mkdtemp

 VERSION = "2.1.0"
-DATE    = "05 September 2014"
+DATE    = "07 October 2014"
 AUTHOR  = "Favero Francesco"
 MAIL    = "favero@cbs.dtu.dk"

@@ -636,7 +636,7 @@ class DefaultHelpParser(argparse.ArgumentParser):
        sys.exit(2)


-def DOpup2seqz(p1, p2, gc, n2, n, qlimit, qformat, hom, het, nproc, chunk, fileout, out_header):
+def DOpup2seqz(p1, p2, gc, n2, n, qlimit, qformat, hom, het, fileout, out_header):
   stream_mpileup = IterableQueue()
   if not n2:
      line_worker_partial = partial(line_worker, depth_sum=n, qlimit=qlimit, qformat=qformat, hom_t=hom, het_t=het)
@@ -644,16 +644,10 @@ def DOpup2seqz(p1, p2, gc, n2, n, qlimit, qformat, hom, het, nproc, chunk, fileo
         pup = multiPileups(normal,tumor)
         pup = GCmultiPileups(pup, gc_file)
         fileout.write("\t".join(out_header) + '\n')
-         if nproc > 0:
-            p = multiprocessing.Pool(processes=nproc)
-            for res in p.imap(line_worker_partial, pup,chunksize=chunk):
-               if res:
-                  fileout.write('\t'.join(map(str,res))+'\n')
-         else:
-            for line in pup:
-               res = line_worker_partial(line)
-               if res:
-                  fileout.write('\t'.join(map(str,res))+'\n')
+         for line in pup:
+            res = line_worker_partial(line)
+            if res:
+               fileout.write('\t'.join(map(str,res))+'\n')
   else:
      line_worker_partial = partial(line_worker, depth_sum=n, qlimit=qlimit, qformat=qformat, hom_t=hom, het_t=het, alt_pileup=True) 
      with xopen(p1, 'rb') as normal, xopen(p2, 'rb') as tumor, xopen(gc, 'rb') as gc_file, xopen(n2, 'rb') as alt_normal:
@@ -661,16 +655,10 @@ def DOpup2seqz(p1, p2, gc, n2, n, qlimit, qformat, hom, het, nproc, chunk, fileo
         pup = GCmultiPileups(pup, gc_file)
         pup = GCmultiPileupsAltDepth(pup, alt_normal)
         fileout.write("\t".join(out_header) + '\n')
-         if nproc > 0:
-            p = multiprocessing.Pool(processes=nproc)
-            for res in p.imap(line_worker_partial, pup,chunksize=chunk):
-               if res:
-                  fileout.write('\t'.join(map(str,res))+'\n')
-         else:
-            for line in pup:
-               res = line_worker_partial(line)
-               if res:
-                  fileout.write('\t'.join(map(str,res))+'\n')
+         for line in pup:
+            res = line_worker_partial(line)
+            if res:
+               fileout.write('\t'.join(map(str,res))+'\n')


 def pileup2acgt(parser, subparser):
@@ -683,11 +671,6 @@ def pileup2acgt(parser, subparser):
                       help='Name of the output file. To use gzip compression name the file ending in .gz. (default STDOUT).')
   parser_pup2muoutput.add_argument('--quiet', dest='quiet', action="store_true",
                       help='Do not output additional debugging information.')
-   parser_pup2muperformance = subparser.add_argument_group(title='Performance', description='Arguments that can effect the performance.')
-   parser_pup2muperformance.add_argument('-p', '--processes', dest='nproc', default=0, type=int,
-                   help='Set the number of processes to split the parsing job. If set to 0 (default), the job will occur with no forking to other processes.')
-   parser_pup2muperformance.add_argument('-c', '--chunk', dest='chunk', default=1000, type=int,
-                   help='Set the number of input lines to assign to each process, if NPROC > 0. (Default = 1000)')
   parser_pup2muqualitysets = subparser.add_argument_group(title='Quality and Format', description='Argument that change the quality threshold or the quality format.')
   parser_pup2muqualitysets.add_argument('-q', '--qlimit', dest='qlimit', default=20,type=int,
                   help='Minimum nucleotide quality score for inclusion in the counts.')
@@ -701,7 +684,6 @@ def pileup2acgt(parser, subparser):
 def pileup2seqz(parser, subparser):
   parser_ABinput    = subparser.add_argument_group(title='Input Files',description='Required input files.')
   parser_ABgenotype    = subparser.add_argument_group(title='Genotyper',description='Options regarding the genotyping.')
-   parser_ABperformance = subparser.add_argument_group(title='Performance', description='Options affecting the performance.')
   parser_ABqualitysets = subparser.add_argument_group(title='Quality and Format', description='Options that change the quality threshold and format.')
   parser_ABinput.add_argument('-n', '--normal', dest = 'normal', required = True,
                   help='Name of the pileup file from the reference/normal sample')
@@ -721,10 +703,6 @@ def pileup2seqz(parser, subparser):
                   help='Threshold to select homozygous positions. Default 0.9.')
   parser_ABgenotype.add_argument('--het', dest = 'het', type = float, default = 0.25,
                   help='Threshold to select heterozygous positions. Default 0.25.')
-   parser_ABperformance.add_argument('-p', '--processes', dest='nproc', default=0, type=int,
-                   help='Set the number of processes to split the parsing job. If set to 0 (default), the job will occur with no forking to other processes.')
-   parser_ABperformance.add_argument('-c', '--chunk', dest='chunk', default=1000, type=int,
-                   help='Set the number of input lines to assign to each process, if NPROC > 0. (Default = 1000)')
   return parser.parse_args()

 def bam2seqz(parser, subparser):
@@ -755,7 +733,7 @@ def bam2seqz(parser, subparser):
   parser_ABsamtools.add_argument("-S", '--samtools', dest = 'samtools', type = str, default = "samtools",
                   help='Path of samtools to use for the pileup generation.')
   parser_ABsamtools.add_argument("-C", '--chromosome', dest = 'chr', type = str, default = None,
-                   help='Argument to restrict the input/output to a chromosome or a chromosome region. Coordinate format is Name:pos.start-pos.end, eg: chr17:7565097-7590856, for a particular region; eg: chr17, for the entire chromosome. Chromosome name are depending of the BAM file and FASTA reference used for alignment. Default behaviour is not selecting any cromosome.')
+                   help='Argument to restrict the input/output to a chromosome or a chromosome region. Coordinate format is Name:pos.start-pos.end, eg: chr17:7565097-7590856, for a particular region; eg: chr17, for the entire chromosome. Chromosome names can checked in the BAM files and are depending on the FASTA reference used for alignment. Default behaviour is to not selecting any cromosome.')
   return parser.parse_args()

 def GC_windows(parser, subparser):
@@ -798,44 +776,15 @@ def main():
      used_module =  sys.argv[1]
      if used_module == "pileup2acgt":
         args = pileup2acgt(parser, parser_pup2mu)
-         if args.chunk == 0:
-            args.chunk = 1
-            args.nproc  = 0
-         if args.nproc >= 1:
-            p = multiprocessing.Pool(processes=args.nproc)
-         if not args.quiet:
-            logging.basicConfig(format='%(message)s')
-            start = time.clock()
-            if args.pileup != '-':
-               file_size = (os.stat(args.pileup).st_size/(1024*1024))
-               logging.warning("Converting " + args.pileup + " -- size = %0.1f MB --" % file_size + " to ACGT..." )
-            else:
-               logging.warning("Converting " + args.pileup + " from STDIN to ACGT..." )
-            logging.warning("Using chunks of " + str(args.chunk) + " line(s), and splitting the job into " + str(args.nproc+1)  + " process(es).")
-         with xopen(args.output, "wb") as fileout:
-            with xopen(args.pileup, "rb") as f:
-               fileout.write('chr' + "\t" + 'n_base' + "\t" + 'ref_base' + "\t" +  'read.depth' + "\t" + 'A' + "\t" + 'C' + "\t" + 'G' + "\t" + 'T' + "\t" + "strand" + '\n')
-               parse_pileup_partial = partial(parse_pileup_str, min_depth=args.n, qlimit=args.qlimit, qformat=args.qformat)
-               counter = 0
-               for chunk in grouper(args.chunk, f):
-                  if args.nproc >= 1:
-                     try:
-                        results = p.map_async(parse_pileup_partial, chunk).get(99)
-                     except AttributeError:
-                        pass
-                  else:
-                     try:
-                        results = map(parse_pileup_partial, chunk)
-                     except AttributeError:
-                        pass
-                  for r in results:
-                     counter = counter + 1
-                     if r:
-                        fileout.write(r + '\n')
-            if not args.quiet:
-               end = time.clock()
-               seconds =  end-start
-               logging.warning("Pileup to ACGT: processed " + str(counter) + " lines in " + str(seconds) + " seconds")
+
+         with xopen(args.output, "wb") as fileout, xopen(args.pileup, "rb") as f:
+            fileout.write('chr' + "\t" + 'n_base' + "\t" + 'ref_base' + "\t" +  'read.depth' + "\t" + 'A' + "\t" + 'C' + "\t" + 'G' + "\t" + 'T' + "\t" + "strand" + '\n')
+            parse_pileup_partial = partial(parse_pileup_str, min_depth=args.n, qlimit=args.qlimit, qformat=args.qformat)
+            for line in f:
+               try:
+                  fileout.write(parse_pileup_partial(line) + '\n')
+               except AttributeError:
+                  pass                        

      elif used_module == "bam2seqz":
         args = bam2seqz(parser, parser_bam2seqz)
@@ -860,7 +809,7 @@ def main():
               nor1 = Popen(cmd_nor1, stdout = PIPE)
               nor2 = Popen(cmd_nor3, stdout = PIPE)
               with named_pipe() as tfifo, named_pipe() as nfifo, named_pipe() as n2fifo:
-                  res = multiprocessing.Process(target = DOpup2seqz, args = (nfifo, tfifo, args.gc, n2fifo, args.n,  args.qlimit, args.qformat, args.hom, args.het, 0, 1, fileout, out_header))
+                  res = multiprocessing.Process(target = DOpup2seqz, args = (nfifo, tfifo, args.gc, n2fifo, args.n,  args.qlimit, args.qformat, args.hom, args.het, fileout, out_header))
                  res.start()
                  with open(nfifo, 'wb', 0) as normal, open(tfifo, 'wb', 0) as tumor, open(n2fifo, 'wb', 0) as normal2:
                     fifos = [Popen(cmd_tum2, stdin=tum1.stdout, stdout=tumor, stderr=PIPE), Popen(cmd_nor2, stdin=nor1.stdout, stdout=normal, stderr=PIPE), Popen(cmd_nor4, stdin=nor2.stdout, stdout=normal2, stderr=PIPE)]
@@ -874,7 +823,7 @@ def main():
               tum1 = Popen(cmd_tum1, stdout = PIPE)
               nor1 = Popen(cmd_nor1, stdout = PIPE)
               with named_pipe() as tfifo, named_pipe() as nfifo:
-                  res = multiprocessing.Process(target = DOpup2seqz, args = (nfifo, tfifo, args.gc, args.normal2, args.n,  args.qlimit, args.qformat, args.hom, args.het, 0, 1, fileout, out_header))
+                  res = multiprocessing.Process(target = DOpup2seqz, args = (nfifo, tfifo, args.gc, args.normal2, args.n,  args.qlimit, args.qformat, args.hom, args.het, fileout, out_header))
                  res.start()
                  with open(nfifo, 'wb', 0) as normal, open(tfifo, 'wb', 0) as tumor:
                     fifos = [Popen(cmd_tum2, stdin=tum1.stdout, stdout=tumor, stderr=PIPE), Popen(cmd_nor2, stdin=nor1.stdout, stdout=normal, stderr=PIPE)]
@@ -888,7 +837,7 @@ def main():
         args = pileup2seqz(parser, parser_pileup2seqz)
         with xopen('-', "wb") as fileout:
            out_header = ["chromosome", "position", "base.ref", "depth.normal", "depth.tumor", "depth.ratio", "Af", "Bf", "zygosity.normal", "GC.percent", "good.reads", "AB.normal", "AB.tumor", "tumor.strand"]
-            res = multiprocessing.Process(target = DOpup2seqz, args = (args.normal, args.tumor, args.gc, args.normal2, args.n,  args.qlimit, args.qformat, args.hom, args.het, args.nproc, args.chunk, fileout, out_header))
+            res = multiprocessing.Process(target = DOpup2seqz, args = (args.normal, args.tumor, args.gc, args.normal2, args.n,  args.qlimit, args.qformat, args.hom, args.het, fileout, out_header))
            res.start()
         res.join()


--- a/inst/CITATION
+++ b/inst/CITATION
@@ -10,6 +10,11 @@ bibentry(bibtype = 'Article',
                            as.person("Qiyuan Li"),
                            as.person("Zoltan Szallasi"),
                            as.person("Aron C Eklund")),
-        journal = "submitted for publication",
-        year = 2014
+        journal = "Annals of Oncology",
+        year = 2015,
+        volume = 26,
+        issue = 1,
+        pages = "64-70",
+        doi = "10.1093/annonc/mdu479",
+        url = "http://annonc.oxfordjournals.org/content/26/1/64"
 )
--- a/inst/doc/sequenza.Rnw
+++ b/inst/doc/sequenza.Rnw
@@ -7,9 +7,9 @@

 \documentclass[10pt,a4paper]{article}
 %\usepackage{a4wide}
-\usepackage{geometry}
+%\usepackage{geometry}
 \usepackage{hyperref}
-\usepackage{subfigure}
+%\usepackage{subfigure}
 \usepackage{listings}
 \usepackage{float}
 \lstdefinestyle{BashInputStyle}{

--- a/inst/doc/sequenza.pdf
+++ b/inst/doc/sequenza.pdf
--- a/man/VarScan2seqz.Rd
+++ b/man/VarScan2seqz.Rd
@@ -7,12 +7,13 @@
 }

 \usage{
-  VarScan2seqz(varscan.somatic, varscan.copynumber = NULL)
+  VarScan2seqz(varscan.somatic, varscan.copynumber = NULL, normal_var_freq = 0.25)
 }

 \arguments{
  \item{varscan.somatic}{a data frame from the output file of \command{somatic}.}
  \item{varscan.copynumber}{a data frame from the output file of \command{copynumber} (optional).}
+  \item{normal_var_freq}{defines the variant allele frequency threshold on the normal genotype to filter out false positive heterozygous position.}
 }

 \details{

--- a/vignettes/sequenza.Rnw
+++ b/vignettes/sequenza.Rnw
@@ -7,9 +7,9 @@

 \documentclass[10pt,a4paper]{article}
 %\usepackage{a4wide}
-\usepackage{geometry}
+%\usepackage{geometry}
 \usepackage{hyperref}
-\usepackage{subfigure}
+%\usepackage{subfigure}
 \usepackage{listings}
 \usepackage{float}
 \lstdefinestyle{BashInputStyle}{