Ealier today, I created the necessary Hisat2 index files for cbai_transcriptome_v3.1
. Next, I needed to actually get the alignments run. The alignments were performed using HISAT2
on Mox using the following set of trimmed FastQ files from 2020414:
- 380822
- 380823
- 380824
- 380825
See Nightingales
(Google Sheet) for more info on those RNAseq data.
SBATCH script (GitHub):
#!/bin/bash
## Job Name
#SBATCH --job-name=20210908-cbai-hisat2-cbai_transcriptome_v3.1
## Allocation Definition
#SBATCH --account=srlab
#SBATCH --partition=srlab
## Resources
## Nodes
#SBATCH --nodes=1
## Walltime (days-hours:minutes:seconds format)
#SBATCH --time=5-00:00:00
## Memory per node
#SBATCH --mem=500G
##turn on e-mail notification
#SBATCH --mail-type=ALL
#SBATCH --mail-user=samwhite@uw.edu
## Specify the working directory for this job
#SBATCH --chdir=/gscratch/scrubbed/samwhite/outputs/20210908-cbai-hisat2-cbai_transcriptome_v3.1
## Hisat2 alignment of C.bairdi RNAseq to cbai_transcriptome_v3.1 transcriptome assembly
## using HiSat2 index generated on 20210908.
## Expects FastQ input filenames to match *R[12]*.fq.gz.
###################################################################################
# These variables need to be set by user
## Assign Variables
# Set number of CPUs to use
threads=28
# Index name for Hisat2 use
# Needs to match index naem used in previous Hisat2 indexing step
transcriptome_index_name="cbai-transcriptome_v3.1"
# Set associative array of sample names and metadata
declare -A samples_associative_array=(
[380822]=d2_uninfected_decreased-temp \
[380823]=d2_infected_decreased-temp \
[380824]=d2_uninfected_elevated-temp \
[380825]=d2_infected_elevated-temp
)
# Paths to programs
hisat2_dir="/gscratch/srlab/programs/hisat2-2.1.0"
hisat2="${hisat2_dir}/hisat2"
samtools="/gscratch/srlab/programs/samtools-1.10/samtools"
# Input/output files
transcriptome_index_dir="/gscratch/srlab/sam/data/C_bairdi/transcriptomes"
fastq_dir="/gscratch/srlab/sam/data/C_bairdi/RNAseq/"
# Programs associative array
declare -A programs_array
programs_array=(
[hisat2]="${hisat2}" \
[samtools_index]="${samtools} index" \
[samtools_sort]="${samtools} sort" \
[samtools_view]="${samtools} view"
)
###################################################################################################
# Exit script if any command fails
set -e
# Load Python Mox module for Python module availability
module load intel-python3_2017
# Copy Hisat2 transcriptome index files
rsync -av "${transcriptome_index_dir}"/${transcriptome_index_name}*.ht2 .
for sample in "${!samples_associative_array[@]}"
do
## Inititalize arrays
fastq_array_R1=()
fastq_array_R2=()
# Create array of fastq R1 files
# and generated MD5 checksums file.
for fastq in "${fastq_dir}""${sample}"*R1*.gz
do
fastq_array_R1+=("${fastq}")
echo "Generating checksum for ${fastq}..."
md5sum "${fastq}" >> input_fastqs_checksums.md5
echo "Checksum for ${fastq} completed."
echo ""
done
# Create array of fastq R2 files
for fastq in "${fastq_dir}""${sample}"*R2*.gz
do
fastq_array_R2+=("${fastq}")
echo "Generating checksum for ${fastq}..."
md5sum "${fastq}" >> input_fastqs_checksums.md5
echo "Checksum for ${fastq} completed."
echo ""
done
# Create comma-separated lists of FastQs for Hisat2
printf -v joined_R1 '%s,' "${fastq_array_R1[@]}"
fastq_list_R1=$(echo "${joined_R1%,}")
printf -v joined_R2 '%s,' "${fastq_array_R2[@]}"
fastq_list_R2=$(echo "${joined_R2%,}")
# Hisat2 alignments
"${programs_array[hisat2]}" \
-x "${transcriptome_index_name}" \
-1 "${fastq_list_R1}" \
-2 "${fastq_list_R2}" \
-S "${sample}".sam \
--rg-id "${sample}" \
--rg "SM:""${samples_associative_array[$sample]}" \
2> "${sample}"_hisat2.err
# Sort SAM files, convert to BAM, and index
${programs_array[samtools_view]} \
-@ "${threads}" \
-Su "${sample}".sam \
| ${programs_array[samtools_sort]} - \
-@ "${threads}" \
-o "${sample}".sorted.bam
${programs_array[samtools_index]} "${sample}".sorted.bam
done
# Delete unneccessary index files
rm "${transcriptome_index_name}"*.ht2
# Delete unneeded SAM files
rm ./*.sam
# Generate checksums
for file in *
do
md5sum "${file}" >> checksums.md5
done
#######################################################################################################
# Capture program options
if [[ "${#programs_array[@]}" -gt 0 ]]; then
echo "Logging program options..."
for program in "${!programs_array[@]}"
do
{
echo "Program options for ${program}: "
echo ""
# Handle samtools help menus
if [[ "${program}" == "samtools_index" ]] \
|| [[ "${program}" == "samtools_sort" ]] \
|| [[ "${program}" == "samtools_view" ]]
then
${programs_array[$program]}
# Handle DIAMOND BLAST menu
elif [[ "${program}" == "diamond" ]]; then
${programs_array[$program]} help
# Handle NCBI BLASTx menu
elif [[ "${program}" == "blastx" ]]; then
${programs_array[$program]} -help
fi
${programs_array[$program]} -h
echo ""
echo ""
echo "----------------------------------------------"
echo ""
echo ""
} &>> program_options.log || true
# If MultiQC is in programs_array, copy the config file to this directory.
if [[ "${program}" == "multiqc" ]]; then
cp --preserve ~/.multiqc_config.yaml multiqc_config.yaml
fi
done
echo "Finished logging programs options."
echo ""
fi
# Document programs in PATH (primarily for program version ID)
echo "Logging system $PATH..."
{
date
echo ""
echo "System PATH for $SLURM_JOB_ID"
echo ""
printf "%0.s-" {1..10}
echo "${PATH}" | tr : \\n
} >> system_path.log
echo "Finished logging system $PATH."
RESULTS
Runtime was ~2.5hrs:
Output folder:
20210908-cbai-hisat2-cbai_transcriptome_v3.1/
BAM files:
380822.sorted.bam (2.3G)
- MD5:
a4c0e147877327def00eda11be86deb9
- MD5:
380823.sorted.bam (2.2G)
- MD5:
a23cf8a0d60a847678ad3edcbef686fd
- MD5:
380824.sorted.bam (2.5G)
- MD5:
1f8c30a8ecf079d9d13ebab306f9506a
- MD5:
380825.sorted.bam (2.6G)
- MD5:
d0331a572821e4f16757acbfb6c410b4
- MD5:
Input FastQ list/checksums (text):
And, now that the alignments are completed, next up will be making variant calls using bcftools.