Created
July 3, 2020 22:03
-
-
Save emvolz/56e8f88009c1c51c1bbfe3ecf1c5ba1c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
' | |
v3: | |
- using june 19 r1 data | |
- deltrans clusters n > 40 | |
- fixing clock rate on rtt consensus | |
' | |
path = '2020-06-19.r1/deltrans_trees' | |
mdfn = '2020-06-19.r1/metadata/master.csv' | |
use_gtds = TRUE | |
minsize = 39 | |
maxsize = Inf # will downsample to this value | |
nthreads = 6 | |
ncpu = 7 | |
ntrees = 20 | |
SGSTART <- as.Date('2020-01-15' ) | |
MHSTEPS <- 5e5 | |
RES = 60 | |
CLOCKRATE <- 0.0008254 | |
TAU0 <- 1e-5 | |
library( ape ) | |
library( lubridate ) | |
library( sarscov2 ) | |
library( treedater ) | |
trfns = list.files( patt='.tree$', path = path, full=TRUE ) | |
lineage_names <- regmatches( trfns, regexpr( trfns, pattern='del_trans_([0-9]+)' )) | |
tres = lapply( trfns, read.tree ) | |
# load metadata | |
md = readRDS( '2020-06-19.r1/2020-06-19.r1.master.rds' ) | |
md$sample_time <- decimal_date( as.Date( md$collection_date )) | |
# sample times for each tree | |
tr2sts <- lapply( tres, function(tr){ | |
setNames( md[ tr$tip , 'sample_time'] , tr$tip.label ) | |
}) | |
#fix seq id , remove samples with no collection date | |
tres <- lapply( 1:length(tres), function(k){ | |
sts = tr2sts[[k]] | |
tr = tres[[k]] | |
todrop = names( sts ) [ is.na( sts ) ] | |
if ( length( todrop ) > (Ntip(tr)-3)) | |
todrop <- todrop[1:(Ntip(tr)-3)] | |
drop.tip(tr, todrop ) | |
}) | |
names(tres ) = names(tr2sts) <- lineage_names | |
# filter out small clusters | |
ns = sapply( tres, Ntip ) | |
print( sort( ns )) | |
keep <- (ns > minsize ) | |
tres = tres[keep ] | |
tr2sts <- tr2sts [ names(tres ) ] | |
lineage_names = names( tres ) | |
#' Generates multiple time trees from given ML tree; randomly resolves polytomies. | |
.cutie_treedater <- function (tr, ntres = 10, threads = 2, ncpu = 5, mrl = c(CLOCKRATE, CLOCKRATE+1E-5)) | |
{ | |
sts <- sapply(strsplit(tr$tip.label, "\\|"), function(x) { | |
as.numeric(tail(x, 2)[1]) | |
}) | |
names(sts) <- tr$tip.label | |
trpl <- tr | |
tr <- di2multi(tr, tol = 1e-05) | |
tres <- lapply(1:ntres, function(i) { | |
tr = unroot(multi2di(tr)) | |
tr$edge.length <- pmax(1/29000/5, tr$edge.length) | |
tr | |
}) | |
tds <- parallel::mclapply(tres, function(tr) { | |
dater(unroot(tr) | |
, sts[tr$tip.label] | |
, s = 29000 | |
, meanRateLimits = mrl | |
, ncpu = ncpu | |
, searchRoot = ncpu - 1 | |
) | |
}, mc.cores = threads) | |
tds | |
} | |
#' Computes time trees and skygrowth estimates | |
.lineage_skygrowth <- function(tr, sts) | |
{ | |
# labels | |
tr3 = tr | |
tr3$tip.label <- paste(sep='|', tr3$tip.label, sts[tr3$tip.label], 'Il' ) | |
# time trees | |
#tds = make_starting_trees( fastafn = tr3, ncpu = ncpu , ntres = ntrees ) | |
st0 = Sys.time() | |
tds = .cutie_treedater(tr3, ntres = ntrees, threads = nthreads, ncpu = ncpu) | |
st1 <- Sys.time() | |
print( paste( 'treedater', st1 - st0 )) | |
# smooth node times | |
if ( use_gtds ){ | |
a = capture.output({ | |
gtds = parallel::mclapply( tds, function(td) gibbs_jitter( td, returnTrees=2 )[[2]] , mc.cores = ncpu*nthreads ) | |
}) | |
sg0 = skygrowth1( gtds, tau0 = TAU0, res = RES, ncpu = ncpu * nthreads, tstart = decimal_date(SGSTART), mhsteps = MHSTEPS) | |
} else{ | |
sg0 = skygrowth1( tds, tau0 = TAU0, res = RES, ncpu = ncpu * nthreads, tstart = decimal_date(SGSTART), mhsteps = MHSTEPS) | |
gtds = tds | |
} | |
# skygrowth | |
sg0$GRmat <- sg0$GRmat[ , sample(1:ncol(sg0$GRmat), size = 50, replace=FALSE) ] # shrink file size | |
sg0$Rmat <- NULL | |
sg0$Ntip = Ntip( tr3 ) | |
list( sg = sg0, gtds = gtds, tds = tds ) | |
} | |
print( Sys.time() ) | |
#~ st0 = system.time( { sg0 = .lineage_skygrowth(tres[[1]]) } ) #71 sec | |
dir.create( 'skygrowth3' ) | |
# loop over ML trees, generate time trees and skygrowth estimates for each | |
sgs <- lapply( 1:length(tres), function(k){ | |
tr = tres[[k]] | |
sts <- tr2sts[[k]] | |
ln <- names(tres)[k] | |
# downsample as needed | |
if ( Ntip( tr ) > maxsize ){ | |
m = Ntip(tr) - maxsize | |
drop = sample( tr$tip.label, size = m, replace=FALSE) | |
tr = drop.tip( tr, drop ) | |
sts = sts[ tr$tip.label ] | |
} | |
st0 = Sys.time() | |
o = .lineage_skygrowth(tr, sts ) | |
sg = o$sg | |
gtds = o$gtds | |
tds = o$tds | |
st1 = Sys.time() | |
print( paste( 'skygrowth', st1 - st0 )) | |
saveRDS(sg, file = paste0('skygrowth3/skygrowth3-sg-', ln, '.rds' ) ) | |
saveRDS(gtds, file = paste0('skygrowth3/skygrowth3-gtds-', ln, '.rds' ) ) | |
saveRDS(tds, file = paste0('skygrowth3/skygrowth3-tds-', ln, '.rds' ) ) | |
print( Sys.time() ) | |
print( paste( lineage_names[k], 'complete' ) ) | |
sg | |
}) | |
names(sgs) <- lineage_names | |
saveRDS(sgs, file=paste0('skygrowth3/skygrowth3-sgs', '.rds' ) ) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment