This notebook entry is knitted from urol-e5/timeseries_molecular/D-Apul/code/02.00-D-Apul-RNAseq-gff-to-gtf.Rmd (GitHub), commit cf654b9.
This notebook will utilize gffread (Pertea and Pertea 2020) to create an A.pulchras GTF file from the A.pulchra genome GFF, which is needed for downstream analysis with HISAT2; specifically for identification of exons and splice sites.
1 Create a Bash variables file
This allows usage of Bash variables across R Markdown chunks.
{
echo "#### Assign Variables ####"
echo ""
echo "# Data directories"
echo 'export timeseries_dir=/home/shared/8TB_HDD_01/sam/gitrepos/urol-e5/timeseries_molecular'
echo 'export genome_dir="${timeseries_dir}/D-Apul/data"'
echo 'export output_dir_top=${timeseries_dir}/D-Apul/output/02.00-D-Apul-RNAseq-gff-to-gtf'
echo ""
echo "# Input/output files"
echo 'export genome_gff="${genome_dir}/Apulchra-genome.gff"'
echo 'export transcripts_gtf="Apulchra-genome.gtf"'
echo "# Paths to programs"
echo 'export programs_dir="/home/shared"'
echo 'export gffread="${programs_dir}/gffread-0.12.7.Linux_x86_64/gffread"'
echo "# Set number of CPUs to use"
echo 'export threads=40'
echo ""
echo "# Programs associative array"
echo "declare -A programs_array"
echo "programs_array=("
echo '[gffread]="${gffread}" \'
echo ")"
echo ""
echo "# Print formatting"
echo 'export line="--------------------------------------------------------"'
echo ""
} > .bashvars
cat .bashvars#### Assign Variables ####
# Data directories
export timeseries_dir=/home/shared/8TB_HDD_01/sam/gitrepos/urol-e5/timeseries_molecular
export genome_dir="${timeseries_dir}/D-Apul/data"
export output_dir_top=${timeseries_dir}/D-Apul/output/02.00-D-Apul-RNAseq-gff-to-gtf
# Input/output files
export genome_gff="${genome_dir}/Apulchra-genome.gff"
export transcripts_gtf="Apulchra-genome.gtf"
# Paths to programs
export programs_dir="/home/shared"
export gffread="${programs_dir}/gffread-0.12.7.Linux_x86_64/gffread"
# Set number of CPUs to use
export threads=40
# Programs associative array
declare -A programs_array
programs_array=(
[gffread]="${gffread}" \
)
# Print formatting
export line="--------------------------------------------------------"
2 Preview and Validate Genome GFF
2.1 Inspect GFF
# Load bash variables into memory
source .bashvars
# Make directories, if they don't exist
mkdir --parents "${output_dir_top}"
head -n 20 "${genome_gff}"##gff-version 3
ntLink_0 funannotate gene 1105 7056 . + . ID=FUN_000001;
ntLink_0 funannotate mRNA 1105 7056 . + . ID=FUN_000001-T1;Parent=FUN_000001;product=hypothetical protein;
ntLink_0 funannotate exon 1105 1188 . + . ID=FUN_000001-T1.exon1;Parent=FUN_000001-T1;
ntLink_0 funannotate exon 1861 1941 . + . ID=FUN_000001-T1.exon2;Parent=FUN_000001-T1;
ntLink_0 funannotate exon 2762 2839 . + . ID=FUN_000001-T1.exon3;Parent=FUN_000001-T1;
ntLink_0 funannotate exon 5044 7056 . + . ID=FUN_000001-T1.exon4;Parent=FUN_000001-T1;
ntLink_0 funannotate CDS 1105 1188 . + 0 ID=FUN_000001-T1.cds;Parent=FUN_000001-T1;
ntLink_0 funannotate CDS 1861 1941 . + 0 ID=FUN_000001-T1.cds;Parent=FUN_000001-T1;
ntLink_0 funannotate CDS 2762 2839 . + 0 ID=FUN_000001-T1.cds;Parent=FUN_000001-T1;
ntLink_0 funannotate CDS 5044 7056 . + 0 ID=FUN_000001-T1.cds;Parent=FUN_000001-T1;
ntLink_0 funannotate gene 10215 15286 . + . ID=FUN_000002;
ntLink_0 funannotate mRNA 10215 15286 . + . ID=FUN_000002-T1;Parent=FUN_000002;product=hypothetical protein;
ntLink_0 funannotate exon 10215 10413 . + . ID=FUN_000002-T1.exon1;Parent=FUN_000002-T1;
ntLink_0 funannotate exon 10614 10676 . + . ID=FUN_000002-T1.exon2;Parent=FUN_000002-T1;
ntLink_0 funannotate exon 11272 11316 . + . ID=FUN_000002-T1.exon3;Parent=FUN_000002-T1;
ntLink_0 funannotate exon 11518 11591 . + . ID=FUN_000002-T1.exon4;Parent=FUN_000002-T1;
ntLink_0 funannotate exon 12241 12501 . + . ID=FUN_000002-T1.exon5;Parent=FUN_000002-T1;
ntLink_0 funannotate exon 13074 14383 . + . ID=FUN_000002-T1.exon6;Parent=FUN_000002-T1;
ntLink_0 funannotate exon 14722 14900 . + . ID=FUN_000002-T1.exon7;Parent=FUN_000002-T1;
2.2 Valdiate GFF
This identifies if there are rows with >9 fields (which there shouldn’t be in a GFF3).
Additionally, it provides a preview of all rows lengths identified.
# Load bash variables into memory
source .bashvars
# Capture number of fields (NF) in each row in array.
field_count_array=($(awk -F "\t" '{print NF}' "${genome_gff}" | sort --unique))
# Check array contents
echo "List of number of fields in ${genome_gff}:"
echo ""
for field_count in "${field_count_array[@]}"
do
echo "${field_count}"
done
echo ""
echo "${line}"
echo ""
# Preview of each line "type" with a given number of fields
# Check array contents
echo ""
for field_count in "${field_count_array[@]}"
do
echo "Preview of lines with ${field_count} field(s):"
echo ""
awk \
-v field_count="${field_count}" \
-F "\t" \
'NF == field_count' \
"${genome_gff}" \
| head
echo ""
echo "${line}"
echo ""
doneList of number of fields in /home/shared/8TB_HDD_01/sam/gitrepos/urol-e5/timeseries_molecular/D-Apul/data/Apulchra-genome.gff:
1
9
--------------------------------------------------------
Preview of lines with 1 field(s):
##gff-version 3
--------------------------------------------------------
Preview of lines with 9 field(s):
ntLink_0 funannotate gene 1105 7056 . + . ID=FUN_000001;
ntLink_0 funannotate mRNA 1105 7056 . + . ID=FUN_000001-T1;Parent=FUN_000001;product=hypothetical protein;
ntLink_0 funannotate exon 1105 1188 . + . ID=FUN_000001-T1.exon1;Parent=FUN_000001-T1;
ntLink_0 funannotate exon 1861 1941 . + . ID=FUN_000001-T1.exon2;Parent=FUN_000001-T1;
ntLink_0 funannotate exon 2762 2839 . + . ID=FUN_000001-T1.exon3;Parent=FUN_000001-T1;
ntLink_0 funannotate exon 5044 7056 . + . ID=FUN_000001-T1.exon4;Parent=FUN_000001-T1;
ntLink_0 funannotate CDS 1105 1188 . + 0 ID=FUN_000001-T1.cds;Parent=FUN_000001-T1;
ntLink_0 funannotate CDS 1861 1941 . + 0 ID=FUN_000001-T1.cds;Parent=FUN_000001-T1;
ntLink_0 funannotate CDS 2762 2839 . + 0 ID=FUN_000001-T1.cds;Parent=FUN_000001-T1;
ntLink_0 funannotate CDS 5044 7056 . + 0 ID=FUN_000001-T1.cds;Parent=FUN_000001-T1;
--------------------------------------------------------
Great! This looks like a valid GFF. Can proceed with GTF generation.
3 Generate GTF
# Load bash variables into memory
source .bashvars
${programs_array[gffread]} -E \
"${genome_gff}" -T \
1> ${output_dir_top}/"${transcripts_gtf}" \
2> ${output_dir_top}/gffread-gff_to_gtf.stderr4 Inspect GTF
# Load bash variables into memory
source .bashvars
head ${output_dir_top}/"${transcripts_gtf}"ntLink_0 funannotate transcript 1105 7056 . + . transcript_id "FUN_000001-T1"; gene_id "FUN_000001"
ntLink_0 funannotate exon 1105 1188 . + . transcript_id "FUN_000001-T1"; gene_id "FUN_000001";
ntLink_0 funannotate exon 1861 1941 . + . transcript_id "FUN_000001-T1"; gene_id "FUN_000001";
ntLink_0 funannotate exon 2762 2839 . + . transcript_id "FUN_000001-T1"; gene_id "FUN_000001";
ntLink_0 funannotate exon 5044 7056 . + . transcript_id "FUN_000001-T1"; gene_id "FUN_000001";
ntLink_0 funannotate CDS 1105 1188 . + 0 transcript_id "FUN_000001-T1"; gene_id "FUN_000001";
ntLink_0 funannotate CDS 1861 1941 . + 0 transcript_id "FUN_000001-T1"; gene_id "FUN_000001";
ntLink_0 funannotate CDS 2762 2839 . + 0 transcript_id "FUN_000001-T1"; gene_id "FUN_000001";
ntLink_0 funannotate CDS 5044 7056 . + 0 transcript_id "FUN_000001-T1"; gene_id "FUN_000001";
ntLink_0 funannotate transcript 10215 15286 . + . transcript_id "FUN_000002-T1"; gene_id "FUN_000002"
4.1 Copy GTF to D-Apul/data
To help make this easier to locate, will copy to the D-Apul/data directory, which also contains the genome FastA, genome FastA index, and the genome GFF files.
# Load bash variables into memory
source .bashvars
cp ${output_dir_top}/"${transcripts_gtf}" "${genome_dir}"