Skip to content
Snippets Groups Projects
Commit 4547049a authored by Francesco Favero's avatar Francesco Favero Committed by Gabor Csardi
Browse files

version 2.1.1

parent 28dcfdc8
No related branches found
No related tags found
No related merge requests found
Package: sequenza
Title: Copy number estimation from tumor genome sequencing data.
Description: This package provides tools to analyze genomic sequencing data from
Title: Copy Number Estimation from Tumor Genome Sequencing Data
Description: Tools to analyze genomic sequencing data from
paired normal-tumor samples, including cellularity and ploidy estimation; mutation
and copy number (allele-specific and total copy number) detection, quantification
and visualization.
Version: 2.1.0
Date: 2014-10-07
Version: 2.1.1
Date: 2015-01-20
Author: Francesco Favero, Andrea M. Marquard, Tejal Joshi, Aron C. Eklund
Maintainer: Francesco Favero <favero@cbs.dtu.dk>
Depends: R (>= 3.0.0), copynumber, parallel, squash
......@@ -13,7 +13,7 @@ Suggests:
License: GPL-3
URL: http://cbs.dtu.dk/biotools/sequenza/, Mailing list:
https://groups.google.com/forum/#!forum/sequenza-user-group
Packaged: 2014-10-08 14:25:00 UTC; favero
Packaged: 2015-01-20 04:38:23 UTC; favero
NeedsCompilation: no
Repository: CRAN
Date/Publication: 2014-10-08 17:22:18
Date/Publication: 2015-01-27 16:55:55
b354c6c06f7a0b447b6d0f780ab0b012 *DESCRIPTION
910f6c45e9e62ddf06de4d13bda4979e *DESCRIPTION
6d09bc56bb99e294c2ec6382e914b2fd *NAMESPACE
231d84899a494f11ec757d7d1e969b5e *NEWS
2006f46b799bcf41bd5236f6a78f801c *NEWS
aeda5901d16339bb5d61ee434422e86d *R/analysis.R
47d5566ca1686c3dabde59008b84f989 *R/bayes.R
592d7b6911d9d06e18b388eed8cbcea3 *R/graphics.R
d4080194f646974cb4f3c0089378f6de *R/misc.R
4020afb3d3685ecdc0ffa459ff213c2d *R/model.R
97c0fa67caa5c8ee0376e599248546b6 *R/next.R
e52bd46020494d28003d6615c41d1c0e *R/next.R
4c2d13a41cc82e15cdc73cd9d0c25e2a *R/workflows.R
dc22c9077dd9a1916c9062e69180a45e *build/vignette.rds
33cb86ef72ebdd33f91839241f0bd890 *data/CP.example.RData
f865934a3a177ac65318e881f0c0c3d3 *data/example.seqz.rda
a13b971aa94a30d151176acb0bccf441 *data/example.seqz.txt.gz
914fe0d8e5903edd47f5dab5d18c2c57 *exec/sequenza-utils.py
d3e4e9be3b08fd3e40977f4f8e1aff9e *inst/CITATION
473d6d18140e030b741d8e3685c271b3 *exec/sequenza-utils.py
49a67dbefc97a501bc47492ca1108e0a *inst/CITATION
e8a13df2363bce7b30d929083c7b40d7 *inst/doc/sequenza.R
bb6d23f2d0984177ffdc53a31782a093 *inst/doc/sequenza.Rnw
1da123b3bb96413596faca66349aa5cb *inst/doc/sequenza.pdf
4120d5d3657936c411c952be04c7f031 *inst/doc/sequenza.Rnw
4f7362a072270cae8a0ec89a2c59f342 *inst/doc/sequenza.pdf
a665959a09431961a03bfca5d9e6887c *man/CP.data.Rd
191710987d5ced5a9d432031d4f856f9 *man/VarScan2seqz.Rd
2443f587e9f9624836a0aece9d24acaf *man/VarScan2seqz.Rd
75446b25323a9826ec51350fee71236c *man/baf.bayes.Rd
618a75158dae6ca2efbb7742e8068499 *man/baf.model.fit.Rd
bd71f7319858ffdfa6f64301dd95d0f0 *man/chromosome.view.Rd
......@@ -35,5 +35,5 @@ c5e45e6bc6f7b4eaadc160f0d28a72ab *man/theoretical.baf.Rd
f2abe359b28581ac7f261a6185a69efb *man/types.matrix.Rd
8a6379846fcba08794e7ecf7dcf50eb6 *man/windowValues.Rd
2f9ad02f2ec9c382394dff14ce62af62 *man/workflows.Rd
bb6d23f2d0984177ffdc53a31782a093 *vignettes/sequenza.Rnw
4120d5d3657936c411c952be04c7f031 *vignettes/sequenza.Rnw
fbb2da96e9546ed5994885d9b2b51994 *vignettes/sequenza.bib
Changes in version 2.1.1 (2015-01-20)
- Fix heterozygous detection when importing VarScan2 data of very high coverage seq.
- Cleanup sequenza-utils.py
- Update citation info to published manuscript
Changes in version 2.1.0 (2014-10-08)
- Add sequenza-utils.py function bam2seqz
- Add raw data depth-ratio/Bf in the genome view plots
......
......@@ -46,7 +46,7 @@ sequenza2PyClone <- function(mut.tab, seg.cn, sample.id, norm.cn = 2) {
na.exclude(pyclone.tsv)
}
VarScan2seqz <- function(varscan.somatic, varscan.copynumber = NULL) {
VarScan2seqz <- function(varscan.somatic, varscan.copynumber = NULL, normal_var_freq = 0.25) {
iupac.nucs <- setNames(c('A', 'C', 'G', 'GT', 'AC', 'AG', 'CG', 'T', 'AT', 'CT'),
c('A', 'C', 'G', 'K', 'M', 'R', 'S', 'T', 'W', 'Y'))
......@@ -56,11 +56,19 @@ VarScan2seqz <- function(varscan.somatic, varscan.copynumber = NULL) {
varscan.somatic$normal_var_freq <- as.numeric(sub('%', '', varscan.somatic$normal_var_freq)) / 100
varscan.somatic$tumor_var_freq <- as.numeric(sub('%', '', varscan.somatic$tumor_var_freq)) / 100
zygosity.normal <- zygosity.vect[varscan.somatic$normal_gt]
AB.normal <- iupac.nucs[varscan.somatic$normal_gt]
AB.tumor <- rep('.', length(AB.normal))
strand <- AB.tumor
if (normal_var_freq > 0.5){
normal_var_freq <- 1 - normal_var_freq
}
idx <- which(zygosity.normal == "het" &
(normal_var_freq >= varscan.somatic$normal_var_freq |
varscan.somatic$normal_var_freq >= (1-normal_var_freq)))
varscan.somatic <- varscan.somatic[-idx, ]
zygosity.normal <- zygosity.vect[varscan.somatic$normal_gt]
AB.normal <- iupac.nucs[varscan.somatic$normal_gt]
AB.tumor <- rep('.', length(AB.normal))
strand <- AB.tumor
depth.normal <- varscan.somatic$normal_reads1 + varscan.somatic$normal_reads2
depth.tumor <- varscan.somatic$tumor_reads1 + varscan.somatic$tumor_reads1
depth.tumor <- varscan.somatic$tumor_reads1 + varscan.somatic$tumor_reads1
depth.ratio <- depth.tumor/depth.normal
Af <- 1 - varscan.somatic$tumor_var_freq
Bf <- rep(0, length(Af))
......
......@@ -16,7 +16,7 @@ from subprocess import Popen, check_call, PIPE
from tempfile import mkdtemp
VERSION = "2.1.0"
DATE = "05 September 2014"
DATE = "07 October 2014"
AUTHOR = "Favero Francesco"
MAIL = "favero@cbs.dtu.dk"
......@@ -636,7 +636,7 @@ class DefaultHelpParser(argparse.ArgumentParser):
sys.exit(2)
def DOpup2seqz(p1, p2, gc, n2, n, qlimit, qformat, hom, het, nproc, chunk, fileout, out_header):
def DOpup2seqz(p1, p2, gc, n2, n, qlimit, qformat, hom, het, fileout, out_header):
stream_mpileup = IterableQueue()
if not n2:
line_worker_partial = partial(line_worker, depth_sum=n, qlimit=qlimit, qformat=qformat, hom_t=hom, het_t=het)
......@@ -644,16 +644,10 @@ def DOpup2seqz(p1, p2, gc, n2, n, qlimit, qformat, hom, het, nproc, chunk, fileo
pup = multiPileups(normal,tumor)
pup = GCmultiPileups(pup, gc_file)
fileout.write("\t".join(out_header) + '\n')
if nproc > 0:
p = multiprocessing.Pool(processes=nproc)
for res in p.imap(line_worker_partial, pup,chunksize=chunk):
if res:
fileout.write('\t'.join(map(str,res))+'\n')
else:
for line in pup:
res = line_worker_partial(line)
if res:
fileout.write('\t'.join(map(str,res))+'\n')
for line in pup:
res = line_worker_partial(line)
if res:
fileout.write('\t'.join(map(str,res))+'\n')
else:
line_worker_partial = partial(line_worker, depth_sum=n, qlimit=qlimit, qformat=qformat, hom_t=hom, het_t=het, alt_pileup=True)
with xopen(p1, 'rb') as normal, xopen(p2, 'rb') as tumor, xopen(gc, 'rb') as gc_file, xopen(n2, 'rb') as alt_normal:
......@@ -661,16 +655,10 @@ def DOpup2seqz(p1, p2, gc, n2, n, qlimit, qformat, hom, het, nproc, chunk, fileo
pup = GCmultiPileups(pup, gc_file)
pup = GCmultiPileupsAltDepth(pup, alt_normal)
fileout.write("\t".join(out_header) + '\n')
if nproc > 0:
p = multiprocessing.Pool(processes=nproc)
for res in p.imap(line_worker_partial, pup,chunksize=chunk):
if res:
fileout.write('\t'.join(map(str,res))+'\n')
else:
for line in pup:
res = line_worker_partial(line)
if res:
fileout.write('\t'.join(map(str,res))+'\n')
for line in pup:
res = line_worker_partial(line)
if res:
fileout.write('\t'.join(map(str,res))+'\n')
def pileup2acgt(parser, subparser):
......@@ -683,11 +671,6 @@ def pileup2acgt(parser, subparser):
help='Name of the output file. To use gzip compression name the file ending in .gz. (default STDOUT).')
parser_pup2muoutput.add_argument('--quiet', dest='quiet', action="store_true",
help='Do not output additional debugging information.')
parser_pup2muperformance = subparser.add_argument_group(title='Performance', description='Arguments that can effect the performance.')
parser_pup2muperformance.add_argument('-p', '--processes', dest='nproc', default=0, type=int,
help='Set the number of processes to split the parsing job. If set to 0 (default), the job will occur with no forking to other processes.')
parser_pup2muperformance.add_argument('-c', '--chunk', dest='chunk', default=1000, type=int,
help='Set the number of input lines to assign to each process, if NPROC > 0. (Default = 1000)')
parser_pup2muqualitysets = subparser.add_argument_group(title='Quality and Format', description='Argument that change the quality threshold or the quality format.')
parser_pup2muqualitysets.add_argument('-q', '--qlimit', dest='qlimit', default=20,type=int,
help='Minimum nucleotide quality score for inclusion in the counts.')
......@@ -701,7 +684,6 @@ def pileup2acgt(parser, subparser):
def pileup2seqz(parser, subparser):
parser_ABinput = subparser.add_argument_group(title='Input Files',description='Required input files.')
parser_ABgenotype = subparser.add_argument_group(title='Genotyper',description='Options regarding the genotyping.')
parser_ABperformance = subparser.add_argument_group(title='Performance', description='Options affecting the performance.')
parser_ABqualitysets = subparser.add_argument_group(title='Quality and Format', description='Options that change the quality threshold and format.')
parser_ABinput.add_argument('-n', '--normal', dest = 'normal', required = True,
help='Name of the pileup file from the reference/normal sample')
......@@ -721,10 +703,6 @@ def pileup2seqz(parser, subparser):
help='Threshold to select homozygous positions. Default 0.9.')
parser_ABgenotype.add_argument('--het', dest = 'het', type = float, default = 0.25,
help='Threshold to select heterozygous positions. Default 0.25.')
parser_ABperformance.add_argument('-p', '--processes', dest='nproc', default=0, type=int,
help='Set the number of processes to split the parsing job. If set to 0 (default), the job will occur with no forking to other processes.')
parser_ABperformance.add_argument('-c', '--chunk', dest='chunk', default=1000, type=int,
help='Set the number of input lines to assign to each process, if NPROC > 0. (Default = 1000)')
return parser.parse_args()
def bam2seqz(parser, subparser):
......@@ -755,7 +733,7 @@ def bam2seqz(parser, subparser):
parser_ABsamtools.add_argument("-S", '--samtools', dest = 'samtools', type = str, default = "samtools",
help='Path of samtools to use for the pileup generation.')
parser_ABsamtools.add_argument("-C", '--chromosome', dest = 'chr', type = str, default = None,
help='Argument to restrict the input/output to a chromosome or a chromosome region. Coordinate format is Name:pos.start-pos.end, eg: chr17:7565097-7590856, for a particular region; eg: chr17, for the entire chromosome. Chromosome name are depending of the BAM file and FASTA reference used for alignment. Default behaviour is not selecting any cromosome.')
help='Argument to restrict the input/output to a chromosome or a chromosome region. Coordinate format is Name:pos.start-pos.end, eg: chr17:7565097-7590856, for a particular region; eg: chr17, for the entire chromosome. Chromosome names can checked in the BAM files and are depending on the FASTA reference used for alignment. Default behaviour is to not selecting any cromosome.')
return parser.parse_args()
def GC_windows(parser, subparser):
......@@ -798,44 +776,15 @@ def main():
used_module = sys.argv[1]
if used_module == "pileup2acgt":
args = pileup2acgt(parser, parser_pup2mu)
if args.chunk == 0:
args.chunk = 1
args.nproc = 0
if args.nproc >= 1:
p = multiprocessing.Pool(processes=args.nproc)
if not args.quiet:
logging.basicConfig(format='%(message)s')
start = time.clock()
if args.pileup != '-':
file_size = (os.stat(args.pileup).st_size/(1024*1024))
logging.warning("Converting " + args.pileup + " -- size = %0.1f MB --" % file_size + " to ACGT..." )
else:
logging.warning("Converting " + args.pileup + " from STDIN to ACGT..." )
logging.warning("Using chunks of " + str(args.chunk) + " line(s), and splitting the job into " + str(args.nproc+1) + " process(es).")
with xopen(args.output, "wb") as fileout:
with xopen(args.pileup, "rb") as f:
fileout.write('chr' + "\t" + 'n_base' + "\t" + 'ref_base' + "\t" + 'read.depth' + "\t" + 'A' + "\t" + 'C' + "\t" + 'G' + "\t" + 'T' + "\t" + "strand" + '\n')
parse_pileup_partial = partial(parse_pileup_str, min_depth=args.n, qlimit=args.qlimit, qformat=args.qformat)
counter = 0
for chunk in grouper(args.chunk, f):
if args.nproc >= 1:
try:
results = p.map_async(parse_pileup_partial, chunk).get(99)
except AttributeError:
pass
else:
try:
results = map(parse_pileup_partial, chunk)
except AttributeError:
pass
for r in results:
counter = counter + 1
if r:
fileout.write(r + '\n')
if not args.quiet:
end = time.clock()
seconds = end-start
logging.warning("Pileup to ACGT: processed " + str(counter) + " lines in " + str(seconds) + " seconds")
with xopen(args.output, "wb") as fileout, xopen(args.pileup, "rb") as f:
fileout.write('chr' + "\t" + 'n_base' + "\t" + 'ref_base' + "\t" + 'read.depth' + "\t" + 'A' + "\t" + 'C' + "\t" + 'G' + "\t" + 'T' + "\t" + "strand" + '\n')
parse_pileup_partial = partial(parse_pileup_str, min_depth=args.n, qlimit=args.qlimit, qformat=args.qformat)
for line in f:
try:
fileout.write(parse_pileup_partial(line) + '\n')
except AttributeError:
pass
elif used_module == "bam2seqz":
args = bam2seqz(parser, parser_bam2seqz)
......@@ -860,7 +809,7 @@ def main():
nor1 = Popen(cmd_nor1, stdout = PIPE)
nor2 = Popen(cmd_nor3, stdout = PIPE)
with named_pipe() as tfifo, named_pipe() as nfifo, named_pipe() as n2fifo:
res = multiprocessing.Process(target = DOpup2seqz, args = (nfifo, tfifo, args.gc, n2fifo, args.n, args.qlimit, args.qformat, args.hom, args.het, 0, 1, fileout, out_header))
res = multiprocessing.Process(target = DOpup2seqz, args = (nfifo, tfifo, args.gc, n2fifo, args.n, args.qlimit, args.qformat, args.hom, args.het, fileout, out_header))
res.start()
with open(nfifo, 'wb', 0) as normal, open(tfifo, 'wb', 0) as tumor, open(n2fifo, 'wb', 0) as normal2:
fifos = [Popen(cmd_tum2, stdin=tum1.stdout, stdout=tumor, stderr=PIPE), Popen(cmd_nor2, stdin=nor1.stdout, stdout=normal, stderr=PIPE), Popen(cmd_nor4, stdin=nor2.stdout, stdout=normal2, stderr=PIPE)]
......@@ -874,7 +823,7 @@ def main():
tum1 = Popen(cmd_tum1, stdout = PIPE)
nor1 = Popen(cmd_nor1, stdout = PIPE)
with named_pipe() as tfifo, named_pipe() as nfifo:
res = multiprocessing.Process(target = DOpup2seqz, args = (nfifo, tfifo, args.gc, args.normal2, args.n, args.qlimit, args.qformat, args.hom, args.het, 0, 1, fileout, out_header))
res = multiprocessing.Process(target = DOpup2seqz, args = (nfifo, tfifo, args.gc, args.normal2, args.n, args.qlimit, args.qformat, args.hom, args.het, fileout, out_header))
res.start()
with open(nfifo, 'wb', 0) as normal, open(tfifo, 'wb', 0) as tumor:
fifos = [Popen(cmd_tum2, stdin=tum1.stdout, stdout=tumor, stderr=PIPE), Popen(cmd_nor2, stdin=nor1.stdout, stdout=normal, stderr=PIPE)]
......@@ -888,7 +837,7 @@ def main():
args = pileup2seqz(parser, parser_pileup2seqz)
with xopen('-', "wb") as fileout:
out_header = ["chromosome", "position", "base.ref", "depth.normal", "depth.tumor", "depth.ratio", "Af", "Bf", "zygosity.normal", "GC.percent", "good.reads", "AB.normal", "AB.tumor", "tumor.strand"]
res = multiprocessing.Process(target = DOpup2seqz, args = (args.normal, args.tumor, args.gc, args.normal2, args.n, args.qlimit, args.qformat, args.hom, args.het, args.nproc, args.chunk, fileout, out_header))
res = multiprocessing.Process(target = DOpup2seqz, args = (args.normal, args.tumor, args.gc, args.normal2, args.n, args.qlimit, args.qformat, args.hom, args.het, fileout, out_header))
res.start()
res.join()
......
......@@ -10,6 +10,11 @@ bibentry(bibtype = 'Article',
as.person("Qiyuan Li"),
as.person("Zoltan Szallasi"),
as.person("Aron C Eklund")),
journal = "submitted for publication",
year = 2014
journal = "Annals of Oncology",
year = 2015,
volume = 26,
issue = 1,
pages = "64-70",
doi = "10.1093/annonc/mdu479",
url = "http://annonc.oxfordjournals.org/content/26/1/64"
)
......@@ -7,9 +7,9 @@
\documentclass[10pt,a4paper]{article}
%\usepackage{a4wide}
\usepackage{geometry}
%\usepackage{geometry}
\usepackage{hyperref}
\usepackage{subfigure}
%\usepackage{subfigure}
\usepackage{listings}
\usepackage{float}
\lstdefinestyle{BashInputStyle}{
......
No preview for this file type
......@@ -7,12 +7,13 @@
}
\usage{
VarScan2seqz(varscan.somatic, varscan.copynumber = NULL)
VarScan2seqz(varscan.somatic, varscan.copynumber = NULL, normal_var_freq = 0.25)
}
\arguments{
\item{varscan.somatic}{a data frame from the output file of \command{somatic}.}
\item{varscan.copynumber}{a data frame from the output file of \command{copynumber} (optional).}
\item{normal_var_freq}{defines the variant allele frequency threshold on the normal genotype to filter out false positive heterozygous position.}
}
\details{
......
......@@ -7,9 +7,9 @@
\documentclass[10pt,a4paper]{article}
%\usepackage{a4wide}
\usepackage{geometry}
%\usepackage{geometry}
\usepackage{hyperref}
\usepackage{subfigure}
%\usepackage{subfigure}
\usepackage{listings}
\usepackage{float}
\lstdefinestyle{BashInputStyle}{
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment