#!/bin/bash
#SBATCH --job-name="ALL"            # name of the job submitted
#SBATCH -p mem                      # name of the queue (partition) you are submitting to
#SBATCH -n 40                       # number of cores/tasks in this job; you get all 20 cores with 2 threads per core with hyperthreading
#SBATCH -N 1                        # number of nodes
#SBATCH --mem=1000G                 # memory requested for the job
#SBATCH -t 47:00:00                 # time allocated for this job, hours:mins:seconds
#SBATCH --mail-user=                # address that will receive an email when the job starts, ends or fails
#SBATCH --mail-type=BEGIN,END,FAIL  # send an email when the job starts, ends or fails
#SBATCH -o "stdout.%j.%N"           # standard out; %j adds the job number and %N adds the node name to the output file name
#SBATCH -e "stderr.%j.%N"           # optional, but it captures standard error

# ENTER COMMANDS HERE:
module load java/1.8.0_121
set -e   # exit immediately if any command fails instead of plowing ahead with the rest of the pipeline

#Written by Brian Bushnell
#Last updated March 4, 2019
#This script is designed to preprocess data for assembly of overlapping 2x150bp reads from Illumina HiSeq 2500.
#Some numbers and steps may need adjustment for different data types or file paths.
#For large genomes, tadpole and bbmerge (during the "Merge" phase) may need the flag "prefilter=2" to avoid running out of memory.
#"prefilter" makes these take twice as long though, so don't use it if you have enough memory.
#The "rm ALL_temp.fq.gz; ln -s reads.fq.gz ALL_temp.fq.gz" is not necessary, but is added so that any pipeline stage can be easily disabled
#without affecting the input file name of the next stage.

#Interleave reads
#reformat.sh in=ALL_R#.fastq.gz out=ALL.fq.gz

#Link the interleaved input file as "ALL_temp.fq.gz"
ln -s ALL.fq.gz ALL_temp.fq.gz

# --- Preprocessing ---

#Remove optical duplicates
#clumpify.sh in=ALL_temp.fq.gz out=ALL.clumped.fq.gz dedupe optical
#rm ALL_temp.fq.gz; ln -s ALL.clumped.fq.gz ALL_temp.fq.gz

#Remove low-quality regions
#filterbytile.sh in=ALL_temp.fq.gz out=ALL.filtered_by_tile.fq.gz
#rm ALL_temp.fq.gz; ln -s ALL.filtered_by_tile.fq.gz ALL_temp.fq.gz

#Trim adapters. Optionally, reads with Ns can be discarded by adding "maxns=0" and reads with really low average quality can be discarded with "maq=8".
#bbduk.sh in=ALL_temp.fq.gz out=ALL.trimmed.fq.gz ktrim=r k=23 mink=11 hdist=1 tbo tpe minlen=70 ref=adapters ftm=5 ordered
#rm ALL_temp.fq.gz; ln -s ALL.trimmed.fq.gz ALL_temp.fq.gz

#Remove synthetic artifacts and spike-ins by kmer-matching.
#bbduk.sh in=ALL_temp.fq.gz out=ALL.filtered.fq.gz k=31 ref=artifacts,phix ordered cardinality
#rm ALL_temp.fq.gz; ln -s ALL.filtered.fq.gz ALL_temp.fq.gz

#Decontamination by mapping can be done here.
#JGI removes these in two phases:
#1) common microbial contaminants (E. coli, Pseudomonas, Delftia, others)
#2) common animal contaminants (human, cat, dog, mouse)
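#A minimal sketch of decontamination by mapping with BBMap, assuming a combined contaminant
#reference FASTA. The file name "contam_ref.fa" and the minid/maxindel settings below are
#illustrative assumptions, not part of the original pipeline. Reads that map to the
#contaminants go to "outm", and clean reads go to "outu", which is then re-linked as
#ALL_temp.fq.gz like the other stages. Commented out, like the other optional stages.
#bbmap.sh in=ALL_temp.fq.gz ref=contam_ref.fa nodisk fast outu=ALL.decontam.fq.gz outm=ALL.contam.fq.gz minid=0.95 maxindel=3
#rm ALL_temp.fq.gz; ln -s ALL.decontam.fq.gz ALL_temp.fq.gz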
#Error-correct phase 1
bbmerge.sh in=ALL_temp.fq.gz out=ALL.ecco.fq.gz ecco mix vstrict ordered ihist=ALL_ihist_merge1.txt
rm ALL_temp.fq.gz; ln -s ALL.ecco.fq.gz ALL_temp.fq.gz

#Error-correct phase 2
clumpify.sh in=ALL_temp.fq.gz out=ALL.eccc.fq.gz ecc passes=4 reorder
rm ALL_temp.fq.gz; ln -s ALL.eccc.fq.gz ALL_temp.fq.gz

#Error-correct phase 3
#Low-depth reads can be discarded here with the "tossjunk", "tossdepth", or "tossuncorrectable" flags.
#For very large datasets, "prefilter=1" or "prefilter=2" can be added to conserve memory.
#Alternatively, bbcms.sh can be used if Tadpole still runs out of memory.
tadpole.sh in=ALL_temp.fq.gz out=ALL.ecct.fq.gz ecc k=62 ordered tossjunk prefilter=2
rm ALL_temp.fq.gz; ln -s ALL.ecct.fq.gz ALL_temp.fq.gz

#Normalize
#This phase can be very beneficial for data with uneven coverage like metagenomes, MDA-amplified single cells, and RNA-seq, but is not usually recommended for isolate DNA.
#So normally this stage should be commented out; note that it is left enabled in this script.
bbnorm.sh in=ALL_temp.fq.gz out=ALL.normalized.fq.gz target=100 hist=ALL_khist.txt peaks=ALL_peaks.txt
rm ALL_temp.fq.gz; ln -s ALL.normalized.fq.gz ALL_temp.fq.gz

#Merge
#This phase handles overlapping reads, and also nonoverlapping reads if there is sufficient coverage and sufficiently short inter-read gaps.
#For very large datasets, "prefilter=1" or "prefilter=2" can be added to conserve memory.
#bbmerge-auto.sh in=ALL_temp.fq.gz out=merged.fq.gz outu=unmerged.fq.gz strict k=93 extend2=80 rem ordered ihist=ihist_merge.txt

#Quality-trim the unmerged reads.
#bbduk.sh in=unmerged.fq.gz out=qtrimmed.fq.gz qtrim=r trimq=10 minlen=70 ordered

# --- Assembly ---
#You do not need to assemble with all assemblers, but I have listed the commands for the 3 I use most often.

#Assemble with Tadpole
#For very large datasets, "prefilter=1" or "prefilter=2" can be added to conserve memory.
#tadpole.sh in=merged.fq.gz,qtrimmed.fq.gz out=tadpole_contigs.fa k=124

#Or assemble with TadWrapper (which automatically finds the best value of K, but takes longer)
#tadwrapper.sh in=merged.fq.gz,qtrimmed.fq.gz out=tadwrapper_contigs_%.fa outfinal=tadwrapper_contigs k=40,124,217 bisect

#Assemble with Spades
#spades.py -k 25,55,95,125 --phred-offset 33 -s merged.fq.gz --12 qtrimmed.fq.gz -o spades_out

#Assemble with Megahit
#Note that the error-correction phases above tend not to be beneficial for Megahit
#megahit --k-min 45 --k-max 225 --k-step 26 --min-count 2 -r merged.fq.gz --12 qtrimmed.fq.gz -o megahit_out

# --- Evaluation ---

#Evaluate assemblies with AssemblyStats
#statswrapper.sh contigs.fa spades_out/scaffolds.fasta megahit_out/contigs.fa format=3 out=

#Evaluate assemblies with Quast (leave out "-R ref.fa" if you don't have a reference)
#quast.py -f -o quast -R ref.fa tadpole_contigs.fa spades_out/scaffolds.fasta megahit_out/contigs.fa

#Pick which assembly you like best.

#Determine the overall taxonomic makeup of the assembly
#sendsketch.sh in=tadpole_contigs.fa

#Or, try to determine taxonomy on a per-contig basis. If this is not sensitive enough, try BLAST instead.
#sendsketch.sh in=tadpole_contigs.fa persequence minhits=1 records=4

#Calculate the coverage distribution, and capture reads that did not make it into the assembly
#bbmap.sh in=filtered.fq.gz ref=tadpole_contigs.fa nodisk covhist=covhist.txt covstats=covstats.txt outm=assembled.fq.gz outu=unassembled.fq.gz maxindel=200 minid=90 qtrim=10 untrim ambig=all