Skip to content

Instantly share code, notes, and snippets.

@emvolz
Created July 3, 2020 22:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save emvolz/56e8f88009c1c51c1bbfe3ecf1c5ba1c to your computer and use it in GitHub Desktop.
Save emvolz/56e8f88009c1c51c1bbfe3ecf1c5ba1c to your computer and use it in GitHub Desktop.
'
v3:
- using june 19 r1 data
- deltrans clusters n > 40
- fixing clock rate on rtt consensus
'
path = '2020-06-19.r1/deltrans_trees'
mdfn = '2020-06-19.r1/metadata/master.csv'
use_gtds = TRUE
minsize = 39
maxsize = Inf # will downsample to this value
nthreads = 6
ncpu = 7
ntrees = 20
SGSTART <- as.Date('2020-01-15' )
MHSTEPS <- 5e5
RES = 60
CLOCKRATE <- 0.0008254
TAU0 <- 1e-5
library( ape )
library( lubridate )
library( sarscov2 )
library( treedater )
trfns = list.files( patt='.tree$', path = path, full=TRUE )
lineage_names <- regmatches( trfns, regexpr( trfns, pattern='del_trans_([0-9]+)' ))
tres = lapply( trfns, read.tree )
# load metadata
md = readRDS( '2020-06-19.r1/2020-06-19.r1.master.rds' )
md$sample_time <- decimal_date( as.Date( md$collection_date ))
# sample times for each tree
tr2sts <- lapply( tres, function(tr){
setNames( md[ tr$tip , 'sample_time'] , tr$tip.label )
})
#fix seq id , remove samples with no collection date
tres <- lapply( 1:length(tres), function(k){
sts = tr2sts[[k]]
tr = tres[[k]]
todrop = names( sts ) [ is.na( sts ) ]
if ( length( todrop ) > (Ntip(tr)-3))
todrop <- todrop[1:(Ntip(tr)-3)]
drop.tip(tr, todrop )
})
names(tres ) = names(tr2sts) <- lineage_names
# filter out small clusters
ns = sapply( tres, Ntip )
print( sort( ns ))
keep <- (ns > minsize )
tres = tres[keep ]
tr2sts <- tr2sts [ names(tres ) ]
lineage_names = names( tres )
#' Generates multiple time trees from given ML tree; randomly resolves polytomies.
.cutie_treedater <- function (tr, ntres = 10, threads = 2, ncpu = 5, mrl = c(CLOCKRATE, CLOCKRATE+1E-5))
{
sts <- sapply(strsplit(tr$tip.label, "\\|"), function(x) {
as.numeric(tail(x, 2)[1])
})
names(sts) <- tr$tip.label
trpl <- tr
tr <- di2multi(tr, tol = 1e-05)
tres <- lapply(1:ntres, function(i) {
tr = unroot(multi2di(tr))
tr$edge.length <- pmax(1/29000/5, tr$edge.length)
tr
})
tds <- parallel::mclapply(tres, function(tr) {
dater(unroot(tr)
, sts[tr$tip.label]
, s = 29000
, meanRateLimits = mrl
, ncpu = ncpu
, searchRoot = ncpu - 1
)
}, mc.cores = threads)
tds
}
#' Computes time trees and skygrowth estimates
.lineage_skygrowth <- function(tr, sts)
{
# labels
tr3 = tr
tr3$tip.label <- paste(sep='|', tr3$tip.label, sts[tr3$tip.label], 'Il' )
# time trees
#tds = make_starting_trees( fastafn = tr3, ncpu = ncpu , ntres = ntrees )
st0 = Sys.time()
tds = .cutie_treedater(tr3, ntres = ntrees, threads = nthreads, ncpu = ncpu)
st1 <- Sys.time()
print( paste( 'treedater', st1 - st0 ))
# smooth node times
if ( use_gtds ){
a = capture.output({
gtds = parallel::mclapply( tds, function(td) gibbs_jitter( td, returnTrees=2 )[[2]] , mc.cores = ncpu*nthreads )
})
sg0 = skygrowth1( gtds, tau0 = TAU0, res = RES, ncpu = ncpu * nthreads, tstart = decimal_date(SGSTART), mhsteps = MHSTEPS)
} else{
sg0 = skygrowth1( tds, tau0 = TAU0, res = RES, ncpu = ncpu * nthreads, tstart = decimal_date(SGSTART), mhsteps = MHSTEPS)
gtds = tds
}
# skygrowth
sg0$GRmat <- sg0$GRmat[ , sample(1:ncol(sg0$GRmat), size = 50, replace=FALSE) ] # shrink file size
sg0$Rmat <- NULL
sg0$Ntip = Ntip( tr3 )
list( sg = sg0, gtds = gtds, tds = tds )
}
print( Sys.time() )
#~ st0 = system.time( { sg0 = .lineage_skygrowth(tres[[1]]) } ) #71 sec
dir.create( 'skygrowth3' )
# loop over ML trees, generate time trees and skygrowth estimates for each
sgs <- lapply( 1:length(tres), function(k){
tr = tres[[k]]
sts <- tr2sts[[k]]
ln <- names(tres)[k]
# downsample as needed
if ( Ntip( tr ) > maxsize ){
m = Ntip(tr) - maxsize
drop = sample( tr$tip.label, size = m, replace=FALSE)
tr = drop.tip( tr, drop )
sts = sts[ tr$tip.label ]
}
st0 = Sys.time()
o = .lineage_skygrowth(tr, sts )
sg = o$sg
gtds = o$gtds
tds = o$tds
st1 = Sys.time()
print( paste( 'skygrowth', st1 - st0 ))
saveRDS(sg, file = paste0('skygrowth3/skygrowth3-sg-', ln, '.rds' ) )
saveRDS(gtds, file = paste0('skygrowth3/skygrowth3-gtds-', ln, '.rds' ) )
saveRDS(tds, file = paste0('skygrowth3/skygrowth3-tds-', ln, '.rds' ) )
print( Sys.time() )
print( paste( lineage_names[k], 'complete' ) )
sg
})
names(sgs) <- lineage_names
saveRDS(sgs, file=paste0('skygrowth3/skygrowth3-sgs', '.rds' ) )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment