INTRO
In preparation for RNA-seq analysis for urol-e5/timeseries_molecular (GitHub repo) project, we needed to index the genome using HISAT2
. I had previously fixed the GFF and prepared a corresponding GTF on 20250114 so that we could identify exons splice sites - both of which are useful during genome indexing to identify alternative isoforms in downstream applications.
The contents below are from markdown knitted from 02.10-F-Ptua-RNAseq-genome-index-HiSat2.Rmd
(commit 81c1117
).
1 Background
This notebook will build an index of the P.tuahiniensis genome using HISAT2 (Kim et al. 2019). It utilizes the GTF file created in 00-genome-GFF-formatting.Rmd
.
Due to the large file sizes of the outputs HISAT2 index files (*.ht2
), they cannot be added to GitHub. As such, they are available for download from here:
2 Create a Bash variables file
This allows usage of Bash variables across R Markdown chunks.
{
echo "#### Assign Variables ####"
echo ""
echo "# Data directories"
echo 'export timeseries_dir=/home/shared/8TB_HDD_01/sam/gitrepos/urol-e5/timeseries_molecular'
echo 'export genome_dir="${timeseries_dir}/F-Ptua/data"'
echo 'export output_dir_top=${timeseries_dir}/F-Ptua/output/02.10-F-Ptua-RNAseq-genome-index-HiSat2'
echo ""
echo "# Input/output files"
echo 'export genome_index_name="Pocillopora_meandrina_HIv1.assembly"'
echo 'export exons="${output_dir_top}/Pocillopora_meandrina_HIv1.assembly_hisat2_exons.tab"'
echo 'export genome_gff="${genome_dir}/Pocillopora_meandrina_HIv1.genes-validated.gff3"'
echo 'export genome_fasta="${genome_dir}/Pocillopora_meandrina_HIv1.assembly.fasta"'
echo 'export splice_sites="${output_dir_top}/Pocillopora_meandrina_HIv1.assembly_hisat2_splice_sites.tab"'
echo 'export transcripts_gtf="${genome_dir}/Pocillopora_meandrina_HIv1.genes-validated.gtf"'
echo "# Paths to programs"
echo 'export programs_dir="/home/shared"'
echo 'export hisat2_dir="${programs_dir}/hisat2-2.2.1"'
echo ""
echo 'export hisat2_build="${hisat2_dir}/hisat2-build"'
echo 'export hisat2_exons="${hisat2_dir}/hisat2_extract_exons.py"'
echo 'export hisat2_splice_sites="${hisat2_dir}/hisat2_extract_splice_sites.py"'
echo ""
echo "# Set number of CPUs to use"
echo 'export threads=40'
echo ""
echo "# Programs associative array"
echo "declare -A programs_array"
echo "programs_array=("
echo '[hisat2]="${hisat2}" \'
echo '[hisat2_build]="${hisat2_build}" \'
echo '[hisat2_exons]="${hisat2_exons}" \'
echo '[hisat2_splice_sites]="${hisat2_splice_sites}" \'
echo ")"
echo ""
echo "# Print formatting"
echo 'export line="--------------------------------------------------------"'
echo ""
} > .bashvars
cat .bashvars
#### Assign Variables ####
# Data directories
export timeseries_dir=/home/shared/8TB_HDD_01/sam/gitrepos/urol-e5/timeseries_molecular
export genome_dir="${timeseries_dir}/F-Ptua/data"
export output_dir_top=${timeseries_dir}/F-Ptua/output/02.10-F-Ptua-RNAseq-genome-index-HiSat2
# Input/output files
export genome_index_name="Pocillopora_meandrina_HIv1.assembly"
export exons="${output_dir_top}/Pocillopora_meandrina_HIv1.assembly_hisat2_exons.tab"
export genome_gff="${genome_dir}/Pocillopora_meandrina_HIv1.genes-validated.gff3"
export genome_fasta="${genome_dir}/Pocillopora_meandrina_HIv1.assembly.fasta"
export splice_sites="${output_dir_top}/Pocillopora_meandrina_HIv1.assembly_hisat2_splice_sites.tab"
export transcripts_gtf="${genome_dir}/Pocillopora_meandrina_HIv1.genes-validated.gtf"
# Paths to programs
export programs_dir="/home/shared"
export hisat2_dir="${programs_dir}/hisat2-2.2.1"
export hisat2_build="${hisat2_dir}/hisat2-build"
export hisat2_exons="${hisat2_dir}/hisat2_extract_exons.py"
export hisat2_splice_sites="${hisat2_dir}/hisat2_extract_splice_sites.py"
# Set number of CPUs to use
export threads=40
# Programs associative array
declare -A programs_array
programs_array=(
[hisat2]="${hisat2}" \
[hisat2_build]="${hisat2_build}" \
[hisat2_exons]="${hisat2_exons}" \
[hisat2_splice_sites]="${hisat2_splice_sites}" \
)
# Print formatting
export line="--------------------------------------------------------"
3 Identify exons
# Load bash variables into memory
source .bashvars
# Make directories, if they don't exist
mkdir --parents "${output_dir_top}"
# Create Hisat2 exons tab file
"${programs_array[hisat2_exons]}" \
"${transcripts_gtf}" \
> "${exons}"
head "${exons}"
Pocillopora_meandrina_HIv1___Sc0000000 10770 11116 +
Pocillopora_meandrina_HIv1___Sc0000000 12783 12874 +
Pocillopora_meandrina_HIv1___Sc0000000 13539 13642 +
Pocillopora_meandrina_HIv1___Sc0000000 14319 14391 +
Pocillopora_meandrina_HIv1___Sc0000000 14676 14751 +
Pocillopora_meandrina_HIv1___Sc0000000 15025 15130 +
Pocillopora_meandrina_HIv1___Sc0000000 15459 15574 +
Pocillopora_meandrina_HIv1___Sc0000000 17212 17286 +
Pocillopora_meandrina_HIv1___Sc0000000 17877 17994 +
Pocillopora_meandrina_HIv1___Sc0000000 18563 18696 +
4 Identify splice sites
# Load bash variables into memory
source .bashvars
# Create Hisat2 splice sites tab file
"${programs_array[hisat2_splice_sites]}" \
"${transcripts_gtf}" \
> "${splice_sites}"
head "${splice_sites}"
Pocillopora_meandrina_HIv1___Sc0000000 11116 12783 +
Pocillopora_meandrina_HIv1___Sc0000000 12874 13539 +
Pocillopora_meandrina_HIv1___Sc0000000 13642 14319 +
Pocillopora_meandrina_HIv1___Sc0000000 14391 14676 +
Pocillopora_meandrina_HIv1___Sc0000000 14751 15025 +
Pocillopora_meandrina_HIv1___Sc0000000 15130 15459 +
Pocillopora_meandrina_HIv1___Sc0000000 15574 17212 +
Pocillopora_meandrina_HIv1___Sc0000000 17286 17877 +
Pocillopora_meandrina_HIv1___Sc0000000 17994 18563 +
Pocillopora_meandrina_HIv1___Sc0000000 18696 19169 +
5 Build HISAT2 genome index
# Load bash variables into memory
source .bashvars
# Change to working directory
cd "${output_dir_top}"
# Build Hisat2 reference index using splice sites and exons
"${programs_array[hisat2_build]}" \
"${genome_fasta}" \
"${genome_index_name}" \
"${exons}" \
--exon "${splice_sites}" \
--ss "${threads}" \
-p > "${genome_index_name}"-hisat2_build.err
2
ls -lh
total 773M
-rw-r--r-- 1 sam sam 227M Mar 6 17:18 Pocillopora_meandrina_HIv1.assembly.1.ht2
-rw-r--r-- 1 sam sam 91M Mar 6 17:18 Pocillopora_meandrina_HIv1.assembly.2.ht2
-rw-r--r-- 1 sam sam 1.9K Mar 6 17:12 Pocillopora_meandrina_HIv1.assembly.3.ht2
-rw-r--r-- 1 sam sam 90M Mar 6 17:12 Pocillopora_meandrina_HIv1.assembly.4.ht2
-rw-r--r-- 1 sam sam 245M Mar 6 17:19 Pocillopora_meandrina_HIv1.assembly.5.ht2
-rw-r--r-- 1 sam sam 92M Mar 6 17:19 Pocillopora_meandrina_HIv1.assembly.6.ht2
-rw-r--r-- 1 sam sam 7.4M Mar 6 17:12 Pocillopora_meandrina_HIv1.assembly.7.ht2
-rw-r--r-- 1 sam sam 1.5M Mar 6 17:12 Pocillopora_meandrina_HIv1.assembly.8.ht2
-rw-r--r-- 1 sam sam 15K Mar 6 17:19 Pocillopora_meandrina_HIv1.assembly-hisat2_build.err
-rw-r--r-- 1 sam sam 12M Mar 6 17:12 Pocillopora_meandrina_HIv1.assembly_hisat2_exons.tab
-rw-r--r-- 1 sam sam 9.7M Mar 6 17:12 Pocillopora_meandrina_HIv1.assembly_hisat2_splice_sites.tab
6 List Hisat2 index files
# Load bash variables into memory
source .bashvars
for index in "${output_dir_top}"/*.ht2
do
cp ${index} ${genome_dir}
done
ls -lh "${output_dir_top}"
total 773M
-rw-r--r-- 1 sam sam 227M Mar 6 17:18 Pocillopora_meandrina_HIv1.assembly.1.ht2
-rw-r--r-- 1 sam sam 91M Mar 6 17:18 Pocillopora_meandrina_HIv1.assembly.2.ht2
-rw-r--r-- 1 sam sam 1.9K Mar 6 17:12 Pocillopora_meandrina_HIv1.assembly.3.ht2
-rw-r--r-- 1 sam sam 90M Mar 6 17:12 Pocillopora_meandrina_HIv1.assembly.4.ht2
-rw-r--r-- 1 sam sam 245M Mar 6 17:19 Pocillopora_meandrina_HIv1.assembly.5.ht2
-rw-r--r-- 1 sam sam 92M Mar 6 17:19 Pocillopora_meandrina_HIv1.assembly.6.ht2
-rw-r--r-- 1 sam sam 7.4M Mar 6 17:12 Pocillopora_meandrina_HIv1.assembly.7.ht2
-rw-r--r-- 1 sam sam 1.5M Mar 6 17:12 Pocillopora_meandrina_HIv1.assembly.8.ht2
-rw-r--r-- 1 sam sam 15K Mar 6 17:19 Pocillopora_meandrina_HIv1.assembly-hisat2_build.err
-rw-r--r-- 1 sam sam 12M Mar 6 17:12 Pocillopora_meandrina_HIv1.assembly_hisat2_exons.tab
-rw-r--r-- 1 sam sam 9.7M Mar 6 17:12 Pocillopora_meandrina_HIv1.assembly_hisat2_splice_sites.tab