Ealier today, I created the necessary Hisat2 index files for cbai_transcriptome_v3.1. Next, I needed to actually get the alignments run. The alignments were performed using HISAT2 on Mox using the following set of trimmed FastQ files from 2020414:

380822
380823
380824
380825

See Nightingales (Google Sheet) for more info on those RNAseq data.

SBATCH script (GitHub):

20210908-cbai-hisat2-cbai_transcriptome_v3.1.sh

#!/bin/bash
## Job Name
#SBATCH --job-name=20210908-cbai-hisat2-cbai_transcriptome_v3.1
## Allocation Definition
#SBATCH --account=srlab
#SBATCH --partition=srlab
## Resources
## Nodes
#SBATCH --nodes=1
## Walltime (days-hours:minutes:seconds format)
#SBATCH --time=5-00:00:00
## Memory per node
#SBATCH --mem=500G
##turn on e-mail notification
#SBATCH --mail-type=ALL
#SBATCH --mail-user=samwhite@uw.edu
## Specify the working directory for this job
#SBATCH --chdir=/gscratch/scrubbed/samwhite/outputs/20210908-cbai-hisat2-cbai_transcriptome_v3.1

## Hisat2 alignment of C.bairdi RNAseq to cbai_transcriptome_v3.1 transcriptome assembly
## using HiSat2 index generated on 20210908.

## Expects FastQ input filenames to match *R[12]*.fq.gz.


###################################################################################
# These variables need to be set by user

## Assign Variables

# Set number of CPUs to use
threads=28

# Index name for Hisat2 use
# Needs to match index naem used in previous Hisat2 indexing step
transcriptome_index_name="cbai-transcriptome_v3.1"

# Set associative array of sample names and metadata
declare -A samples_associative_array=(
[380822]=d2_uninfected_decreased-temp \
[380823]=d2_infected_decreased-temp \
[380824]=d2_uninfected_elevated-temp \
[380825]=d2_infected_elevated-temp
)

# Paths to programs
hisat2_dir="/gscratch/srlab/programs/hisat2-2.1.0"
hisat2="${hisat2_dir}/hisat2"
samtools="/gscratch/srlab/programs/samtools-1.10/samtools"


# Input/output files
transcriptome_index_dir="/gscratch/srlab/sam/data/C_bairdi/transcriptomes"
fastq_dir="/gscratch/srlab/sam/data/C_bairdi/RNAseq/"


# Programs associative array
declare -A programs_array
programs_array=(
[hisat2]="${hisat2}" \
[samtools_index]="${samtools} index" \
[samtools_sort]="${samtools} sort" \
[samtools_view]="${samtools} view"
)


###################################################################################################

# Exit script if any command fails
set -e

# Load Python Mox module for Python module availability

module load intel-python3_2017


# Copy Hisat2 transcriptome index files
rsync -av "${transcriptome_index_dir}"/${transcriptome_index_name}*.ht2 .

for sample in "${!samples_associative_array[@]}"
do

  ## Inititalize arrays
  fastq_array_R1=()
  fastq_array_R2=()

  # Create array of fastq R1 files
  # and generated MD5 checksums file.
  for fastq in "${fastq_dir}""${sample}"*R1*.gz
  do
    fastq_array_R1+=("${fastq}")
    echo "Generating checksum for ${fastq}..."
    md5sum "${fastq}" >> input_fastqs_checksums.md5
    echo "Checksum for ${fastq} completed."
    echo ""
  done

  # Create array of fastq R2 files
  for fastq in "${fastq_dir}""${sample}"*R2*.gz
  do
    fastq_array_R2+=("${fastq}")
    echo "Generating checksum for ${fastq}..."
    md5sum "${fastq}" >> input_fastqs_checksums.md5
    echo "Checksum for ${fastq} completed."
    echo ""
  done

  # Create comma-separated lists of FastQs for Hisat2
  printf -v joined_R1 '%s,' "${fastq_array_R1[@]}"
  fastq_list_R1=$(echo "${joined_R1%,}")

  printf -v joined_R2 '%s,' "${fastq_array_R2[@]}"
  fastq_list_R2=$(echo "${joined_R2%,}")


  # Hisat2 alignments
  "${programs_array[hisat2]}" \
  -x "${transcriptome_index_name}" \
  -1 "${fastq_list_R1}" \
  -2 "${fastq_list_R2}" \
  -S "${sample}".sam \
  --rg-id "${sample}" \
  --rg "SM:""${samples_associative_array[$sample]}" \
  2> "${sample}"_hisat2.err

  # Sort SAM files, convert to BAM, and index
  ${programs_array[samtools_view]} \
  -@ "${threads}" \
  -Su "${sample}".sam \
  | ${programs_array[samtools_sort]} - \
  -@ "${threads}" \
  -o "${sample}".sorted.bam
  ${programs_array[samtools_index]} "${sample}".sorted.bam

done

# Delete unneccessary index files
rm "${transcriptome_index_name}"*.ht2

# Delete unneeded SAM files
rm ./*.sam

# Generate checksums
for file in *
do
  md5sum "${file}" >> checksums.md5
done

#######################################################################################################

# Capture program options
if [[ "${#programs_array[@]}" -gt 0 ]]; then
  echo "Logging program options..."
  for program in "${!programs_array[@]}"
  do
    {
    echo "Program options for ${program}: "
    echo ""
    # Handle samtools help menus
    if [[ "${program}" == "samtools_index" ]] \
    || [[ "${program}" == "samtools_sort" ]] \
    || [[ "${program}" == "samtools_view" ]]
    then
      ${programs_array[$program]}

    # Handle DIAMOND BLAST menu
    elif [[ "${program}" == "diamond" ]]; then
      ${programs_array[$program]} help

    # Handle NCBI BLASTx menu
    elif [[ "${program}" == "blastx" ]]; then
      ${programs_array[$program]} -help
    fi
    ${programs_array[$program]} -h
    echo ""
    echo ""
    echo "----------------------------------------------"
    echo ""
    echo ""
  } &>> program_options.log || true

    # If MultiQC is in programs_array, copy the config file to this directory.
    if [[ "${program}" == "multiqc" ]]; then
      cp --preserve ~/.multiqc_config.yaml multiqc_config.yaml
    fi
  done
  echo "Finished logging programs options."
  echo ""
fi


# Document programs in PATH (primarily for program version ID)
echo "Logging system $PATH..."
{
date
echo ""
echo "System PATH for $SLURM_JOB_ID"
echo ""
printf "%0.s-" {1..10}
echo "${PATH}" | tr : \\n
} >> system_path.log
echo "Finished logging system $PATH."

RESULTS

Runtime was ~2.5hrs:

Runtime for Hisat2 alignment of RNAseq data to cbai_transcriptome_v3.1

Output folder:

20210908-cbai-hisat2-cbai_transcriptome_v3.1/
- BAM files:
  - 380822.sorted.bam (2.3G)
    - MD5: a4c0e147877327def00eda11be86deb9
  - 380823.sorted.bam (2.2G)
    - MD5: a23cf8a0d60a847678ad3edcbef686fd
  - 380824.sorted.bam (2.5G)
    - MD5: 1f8c30a8ecf079d9d13ebab306f9506a
  - 380825.sorted.bam (2.6G)
    - MD5: d0331a572821e4f16757acbfb6c410b4
- Input FastQ list/checksums (text):
  - 20210908-cbai-hisat2-cbai_transcriptome_v3.1/input_fastqs_checksums.md5

And, now that the alignments are completed, next up will be making variant calls using bcftools.

RESULTS

BAM files:

Input FastQ list/checksums (text):