% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sim.R
\name{SimulateMSeq}
\alias{SimulateMSeq}
\title{A Semiparametric Model-based Microbiome Sequencing Data Simulator for Cross-sectional and Case-control Studies}
\description{
The function generates synthetic microbiome sequencing data for studying the performance of differential abundance analysis methods.  
It uses a user-supplied (large) reference OTU table as a template to generate a synthetic OTU table of specified size. A subset of OTUs  are affected
by a simulated covariate of interest, either binary or continuous.  Confounder effects can also be simulated.  The function allows simulating
different signal structures, i.e., the percentage of differential OTUs, their effect sizes, their direction of change,  and whether these OTUs are
relatively abundant or rare.
}

\usage{
SimulateMSeq(
  ref.otu.tab,
  nSam = 100,
  nOTU = 500,
  diff.otu.pct = 0.1,
  diff.otu.direct = c("balanced", "unbalanced"),
  diff.otu.mode = c("abundant", "rare", "mix"),
  covariate.type = c("binary", "continuous"),
  grp.ratio = 1,
  covariate.eff.mean = 1,
  covariate.eff.sd = 0,
  confounder.type = c("none", "binary", "continuous", "both"),
  conf.cov.cor = 0.6,
  conf.diff.otu.pct = 0,
  conf.nondiff.otu.pct = 0.1,
  confounder.eff.mean = 0,
  confounder.eff.sd = 0,
  error.sd = 0,
  depth.mu = 10000,
  depth.theta = 5,
  depth.conf.factor = 0
)
}
\arguments{
\item{ref.otu.tab}{a matrix, the reference OTU count table (row - OTUs, column - samples), serving as the template for synthetic sample generation.}


\item{nSam}{the number of samples to be simulated.}

\item{nOTU}{the number of OTUs to be simulated. }

\item{diff.otu.pct}{a numeric value between 0 and 1, the percentage of differential OTUs to be simulated. If 0, global null setting is simulated. The default is 0.1.}

\item{diff.otu.direct}{a character string of "balanced" or "unbalanced".  "balanced" - the direction of change for these differential OTUs is random, "unbalanced" - direction of change is the same. The default is "balanced".}

\item{diff.otu.mode}{a character string of  "rare", "mix" or "abundant".  "abundant" - differential OTU come from the top quartile of the abundance distribution, 
"rare" - differential OTU come from the bottom quartile of the abundance distribution, and "mix" - random set. The default is "abundant". }

\item{covariate.type}{a character string of "binary" or "continuous", indicating the type of the covariate to be simulated. The default is "binary" (e.g., case v.s. control).}

\item{grp.ratio}{a numeric value between 0 and 1. Group size ratio. The default is 1, i.e., equal group size. Only relevant when \code{covariate.type} is "binary". }

\item{covariate.eff.mean}{a numeric value, the mean log fold change (effect size) in response to one unit change of the covariate.  The default is 1.}

\item{covariate.eff.sd}{a positive numeric value, the standard deviation of the log fold change.  The default is 0, i.e., the log fold change is the same across differential OTUs.}

\item{confounder.type}{a character string of "none", "binary", "continuous" or "both". The default is "none", no confounder will be simulated.
 If "both", both a binary and continuous confounder will be simulated. The default is "none".}

\item{conf.cov.cor}{a numeric value between 0 and 1.  The correlation between the covariate of interest and the confounder. The default is 0.6.}

\item{conf.diff.otu.pct}{a numeric value between 0 and 1. The percentage of OTUs affected by the confounder and the covariate of interest. The default is 0.}

\item{conf.nondiff.otu.pct}{a numeric value between 0 and 1. The percentage of OTUs affected by the confounder but not the covariate of interest. The default is 0.1.}

\item{confounder.eff.mean}{a numeric value, the mean log fold change (effect size) in response to one unit change of the confounder.  The default is 1.}

\item{confounder.eff.sd}{a positive numeric value, the standard deviation of the log fold change for the confounder.  The default is 0, i.e., the log fold change is the same across OTUs affected by the confounder.}

\item{error.sd}{the sd of the log fold change unexplained by the covariate and the confounder (i.e., the error term under the log linear model).  The default is 0. }

\item{depth.mu}{the mean sequencing depth to be simulated. The default is 10,000.}

\item{depth.theta}{the theta value  of the negative binomial distribution controlling the variance (\code{mu + mu^2/theta}). The default is 5.}

\item{depth.conf.factor}{a numeric value controlling the dependence of the sequencing depth on the covariate of interest (\code{depth.mu} * \code{exp(scale(X)} * \code{depth.conf.factor})). 
The default is 0, i.e., the depth is not associated with the covariate of interest. This parameter can be used to simulate depth confounding.}
}
\value{
Return a list with the elements:
\item{otu.tab.sim}{simulated OTU table}
\item{covariate}{simulated covariate of interest}
\item{confounder}{simulated confounder(s)}
\item{diff.otu.ind}{indices of the differential OTUs, i.e., affected by the covariate of interest}
\item{otu.names}{the names of the simulated OTUs}
\item{conf.otu.ind}{indices of OTUs affected by the confounder(s)}
}

\details{

This function implements a semiparametric approach for realistic independent microbiome sequencing data generation. The method draws random samples from a large reference dataset (non-parametric part) and uses
 these reference samples as templates to generate new samples (parametric part). Specifically, for each drawn reference sample, it infers the underlying composition based on a Bayesian model and 
 then adds covariate/confounder effects to the composition vector, based on which a new sequencing sample is generated. The method circumvents the difficulty in modeling the inter-subject variation
 of the microbiome composition.   

}
\references{
Yang, L. & Chen, J. 2022. A comprehensive evaluation of differential abundance analysis methods: current status and potential solutions.
Microbiome. In Press.
}


\author{Jun Chen and Lu Yang}

\examples{

# Use throat microbiome for illustration
data(throat.otu.tab)
comm <- t(throat.otu.tab)
comm <- comm[rowMeans(comm != 0) > 0.2, ]

# Simulate binary covariate, 10\% signal density, abundant differential OTUs, unbalanced change 
# This setting simulates strong compositional effects
sim.obj <- SimulateMSeq(
		ref.otu.tab = comm, nSam = 50, nOTU = 50,
		# True signal setting
		diff.otu.pct = 0.1, diff.otu.direct = c("unbalanced"), 
		diff.otu.mode = c("abundant"),
		covariate.type = c("binary"), grp.ratio = 1,
		covariate.eff.mean = 1.0, covariate.eff.sd = 0,
		# Confounder signal setting
		confounder.type = c("both"), conf.cov.cor = 0.6,
		conf.diff.otu.pct = 0.1, conf.nondiff.otu.pct = 0.1,
		confounder.eff.mean = 1.0, confounder.eff.sd = 0,
		# Depth setting
		depth.mu = 10000, depth.theta = 5, depth.conf.factor = 0
)

meta.dat <- data.frame(X = sim.obj$covariate, Z1 = sim.obj$confounder[, 1],
                       Z2 = sim.obj$confounder[, 2])
otu.tab.sim <- sim.obj$otu.tab.sim

# Run ZicoSeq for differential abundance analysis
zico.obj <- ZicoSeq(meta.dat = meta.dat, feature.dat = otu.tab.sim, 
		grp.name = 'X', adj.name = c('Z1', 'Z2'), feature.dat.type = "count",
		# Filter to remove rare taxa
		prev.filter = 0.2, mean.abund.filter = 0,  max.abund.filter = 0.002, min.prop = 0, 
		# Winsorization to replace outliers
		is.winsor = TRUE, outlier.pct = 0.03, winsor.end = 'top',
		# Posterior sampling to impute zeros
		is.post.sample = TRUE, post.sample.no = 25, 
		# Multiple link functions to capture diverse taxon-covariate relation
		link.func = list(function (x) x^0.25, function (x) x^0.5, function (x) x^0.75), 
		stats.combine.func = max,
		# Permutation-based multiple testing correction
		perm.no = 99,  strata = NULL, 
		# Reference-based multiple stage normalization
		ref.pct = 0.5, stage.no = 6, excl.pct = 0.2,
		# Family-wise error rate control
		is.fwer = FALSE,
		verbose = TRUE, return.feature.dat = FALSE)

# Detected differential OTUs
which(zico.obj$p.adj.fdr <= 0.05)

# True differential OTUs
sim.obj$otu.names[sim.obj$diff.otu.ind]
}

\keyword{composition}
\keyword{microbiome}
\keyword{simulation}

