INTRO
This notebook annotated expressed genes with gene ontologies, assigned them to GOslims, and generated corresponding counts of GOs in each GOslim category, as part of urol-e5/deep-dive-expression (GitHub repo).
The contents below are from markdown knitted from 30.00-Peve-transcriptome-GOslims.md
(commit 0b980cb).
1 BACKGROUND
This notebook will perform annotation of expressed genes, as previously determined by 06.2-Peve-Hisat.qmd
(GitHub).
Briefly, the notebook will perform the following tasks:
Extract all genes from the genome, as GFF and FastA.
Create a subset of only expressed genes, based on gene count matrix.
BLASTx expressed genes against SwissProt database.
Get gene ontology.
Map gene ontology to GOslims and get counts.
Expressed genes were defined as those genes having at least one count across all samples.
1.1 INPUTS
Gene count matrix
Genome FastA
Genome GFF
1.2 OUTPUTS
Genes BED
Genes FastA
Expressed genes FastA
Expressed genes SwissProt IDs only file.
Expressed genes to SwissProt IDs mapping file.
Expressed genes to SwissProtIDs and GO mapping file.
Counts file of expressed genes GOslims.
1.3 SOFTWARE
DIAMOND BLAST (Buchfink et al. 2021)
bedtools (Quinlan and Hall 2010)
samtools (Danecek et al. 2021)
Biostrings (Pagès et al. 2017) (Bioconductor R package)
GO.db (Carlson 2017) (Bioconductor R package)
GSEABase (Morgan et al. 2017) (Bioconductor R package)
2 VARIABLES
# DIRECTORIES
# Parent directory holding the per-notebook output directories.
top_output_dir <- file.path("..", "output")
# Output directory for this notebook.
output_dir <- file.path(top_output_dir, "30.00-Peve-transcriptome-GOslims")
# Input data directory.
data_dir <- file.path("..", "data")
# PROGRAMS
# Location of BLAST/DIAMOND databases.
blastdbs_dir <- file.path("..", "..", "M-multi-species", "data", "blastdbs")
# Root of the shared software installs; the leading "" yields "/home/shared".
programs_dir <- file.path("", "home", "shared")
bedtools_dir <- file.path(programs_dir, "bedtools-v2.30.0", "bin")
blast_dir <- file.path(programs_dir, "ncbi-blast-2.15.0+", "bin")
diamond <- file.path(programs_dir, "diamond-2.1.8")
fastaFromBed <- file.path(bedtools_dir, "fastaFromBed")
samtools_dir <- file.path(programs_dir, "samtools-1.12")
samtools <- file.path(samtools_dir, "samtools")
# FILES
# Inputs
count_matrix <- file.path(
  top_output_dir, "06.2-Peve-Hisat", "gene_count_matrix.csv"
)
genome_fasta <- file.path(data_dir, "Porites_evermanni_v1.fa")
og_genome_gff <- file.path(data_dir, "Porites_evermanni_validated.gff3")

# DIAMOND database name and BLASTx results file.
diamond_db <- "20250620-diamond"
diamond_output <- file.path(
  output_dir, "Pevermanni-expressed-genes.blastx.outfmt6"
)

# Outputs: all genes (BED, FastA, FastA index).
genes_bed <- file.path(output_dir, "Pevermanni-genes.bed")
genes_fasta <- file.path(output_dir, "Pevermanni-genes.fasta")
genes_fasta_index <- file.path(output_dir, "Pevermanni-genes.fasta.fai")

# Outputs: gene subset (FastA, FastA index).
genes_subset_fasta <- file.path(output_dir, "Pevermanni-subset-genes.fasta")
genes_subset_fasta_index <- file.path(
  output_dir, "Pevermanni-subset-genes.fasta.fai"
)
# THREADS
# Kept as a character string because it is exported via Sys.setenv() below.
threads <- "40"
##### Official GO info - no need to change #####
# GOslim generic subset: OBO file name and the URL it is published at.
goslims_obo <- "goslim_generic.obo"
goslims_url <- "http://current.geneontology.org/ontology/subsets/goslim_generic.obo"
# FORMATTING
# Horizontal rule string; also exported to bash chunks via Sys.setenv() below.
line <- "-----------------------------------------------"
# Export these as environment variables for bash chunks.
# Each R variable is exported under its own name, so a bash chunk can
# reference it as "${name}".
Sys.setenv(
  # Directories
  top_output_dir = top_output_dir,
  output_dir = output_dir,
  data_dir = data_dir,
  blastdbs_dir = blastdbs_dir,
  # Programs
  diamond = diamond,
  fastaFromBed = fastaFromBed,
  samtools = samtools,
  # Files
  count_matrix = count_matrix,
  diamond_db = diamond_db,
  diamond_output = diamond_output,
  genome_fasta = genome_fasta,
  og_genome_gff = og_genome_gff,
  genes_bed = genes_bed,
  genes_fasta = genes_fasta,
  genes_fasta_index = genes_fasta_index,
  genes_subset_fasta = genes_subset_fasta,
  genes_subset_fasta_index = genes_subset_fasta_index,
  # Threads and formatting
  threads = threads,
  line = line
)
3 Extract genes as FastA
3.1 Extract genes from GFF
# Read GFF, skipping comment lines
# The GFF has no header row, so the nine column names are supplied explicitly.
genome_gff <- readr::read_tsv(
  og_genome_gff,
  comment = "#",
  col_names = c(
    "seqid", "source", "type", "start", "end",
    "score", "strand", "phase", "attributes"
  )
)
# Filter for gene features
# Builds a four-column BED-style table (chrom, start, end, gene_id) from the
# GFF rows whose type is "gene".
genes <- as.data.frame(genome_gff) %>%
  dplyr::filter(type == "gene") %>%
  dplyr::mutate(
    chrom = seqid,
    start = start - 1, # BED is 0-based
    end = end,
    # Capture the value of the ID= attribute (everything up to the next ";").
    gene_id = sub("ID=([^;]+);?.*", "\\1", attributes)
  ) %>%
  dplyr::select(chrom, start, end, gene_id)

str(genes)
'data.frame': 40389 obs. of 4 variables:
$ chrom : chr "Porites_evermani_scaffold_1" "Porites_evermani_scaffold_1" "Porites_evermani_scaffold_1" "Porites_evermani_scaffold_1" ...
$ start : num 3106 8491 26133 30312 32615 ...
$ end : num 4488 11026 26404 42225 67628 ...
$ gene_id: chr "gene-Peve_00000001" "gene-Peve_00000039" "gene-Peve_00000112" "gene-Peve_00000117" ...
3.1.1 Write genes to BED
# Write to BED
# BED files carry no header row, hence col_names = FALSE.
readr::write_tsv(genes, genes_bed, col_names = FALSE)
3.2 Create genes FastA
# Extract each BED interval from the genome FastA; -nameOnly uses the BED
# name column (the gene ID) as the FastA header.
"$fastaFromBed" -fi "$genome_fasta" -bed "$genes_bed" -nameOnly > "$genes_fasta"

# Create FastA index
"$samtools" faidx "$genes_fasta"

# Preview the FastA and then its index, separated by two blank lines.
head "$genes_fasta"
printf '\n\n'
head "$genes_fasta_index"
>gene-Peve_00000001
TTACTGCTTCAGTATGTGAATTTCGATGGTGGCTTGACCGGAGTTAGACATGGCCCCCCTTGCTCGGAGTCCCGATCCCAAATCTCTCTCGTGCCAGTAGTTATCTCCTTTGAAGGGATTATCGTAATACATCCGGTACCAAAGATTGTAGTTGGCGCGTCTTTTACCACTGTAAAGCCAAACATCAAACCAGTTGCTGTACCAGTTGTAGTCAAATGGAACGGAATACATAACAGCAAGTGTTTTATCGATGTGTGGGATGTAGTATGTTAATACTCCTACTGCGCCTCTCGCAACGGGCCCCGCAGTTTTTCGTGCGCCGTAAAGCAAAGCTGTGCCTAGAAATAAGCAAACAAATTAGCAATTAAACAAAAATCATAAACATGATCATTCGTCACAATGATACATGGCAAACNTTGTGTCAGGTTAGTGATTGCTTATCTTTTCCCCAAATATTACACGCAAAGGGATTCTCATTCTGAACATGAATACTAAGTTTAGTTGGTTTAAGAAGCTCTTTCAACCCTGAAGTCATTTTGCCGTTCTGACAAAGCAAAACATAAACCTGTTGTCTGCAAAATTTCAAGAATGGGCAAATCCTCTAATACAGTTTCTTTACTCTTTAGGGTAAGTGGTCGCTGAGTTCGTCCTTGGTTTCATACGGTAATAGGCCCTTTGCACGCGATGAACTCATTTTACTACTACTACTACTACCAGAATCACTCGGGGTTTTGCTTTCTTGGACAAACGAGGGCTTTATTTTTCAAACCTCGCATAACCTGTGATCAGGCCCCCAAAACAAAAAGGGAAGAAGGACCGCCTGATTGCAGGTTAAATTCTTCGCAGAGATTAGCTAGAAAAATTAAATATGACAAGAAAACCAAAATGGATTCTATAGCAATAGATCAGACCTTTTTAGCTTGTATGGTTTGTTTACCCATTTCAGACCAAGTGATGGTACCCCAGAGAACTGTTTCTTTCAATGTCGTCTTTTGCACGTGCACCCACATGTTATGTATGCATAATTATATAAAAGACAAATTCCCCCAAGAACATCACGTGGTCTGAATTGGGAAAACAAAACAAACAAGCTAATGACACCATCCTGCATACTGGCCTTAGTGTTATTACTAAAAGAAAGGATTTTAAACTTTTATGTTATTAATATTAATTTTACCTGAAGAAACATCATGAGGCAAAACACGATTTGACGTCCCTGAATAAAAATATATGTTGACTGCTCTCCATTTATATCCACTTTCGTTATCAACACCAATGGCGACCTTGCGGCTTATGCTACCAAGGGTGTTTAAAATTGTTGTGAGAATGCCCAAGCCAAGTTGAGCACCGCTGATGACAGCACCAGCGTCAGCTAAAATTTT
>gene-Peve_00000039
ATGGGCATCGTTAGGCTGTTCACCCTTCTGCTTTTCGTTCTGCTGATAACTCAGTCAATCTTTGGGTCTCCCTTCAAGCAAGACAATCATGTCGAAAGGAGTCCTGGATCGTCCTCTAAACACACTGAGGTAAAACAGTTCTCTTTAAAATAAGATAGTCAATTGGTGTCTTGTGTTTAAGACGAACATGAAAGTGTCTCTTGGGGGTGGCGTGATATCTACGGAAATGCAACTCACTGTTCTAATCGGATCTCGGTTGTTTCACTAAAACAAAATGAACCCTAATCTCGGACCGAATATAAAACCGACTAACACCCCCCCCCCCCCCCTCCAGTTCAGAATTCTAATCTCGACACTCGCTTTCCCTTTTCTCCAAAACACCGATTACCCTTGTGAGTGTGTAGTGCTGCGAGTAGATACTAGGTAGTTTTACACCTTAACGCTTGAATGTATAGTAAACATTTTAATAATTTTAAGCCGTAAATCTATTGAGTTCTTTGTGATGTGTCCCTCAATAATCAATGTTCAGTTTGGCGAAGTGTAAACTGAATGAATACACAGCCATCATGCAATTCTTGGTGGTTTCGCTAAGGCTCCCAGCGATAATTTTAGCCGGCTTCCGATTTCATTTCTGTGCCTCAAATGTATTTAAATACACAGAGGAAAAATTAGTGAAAAAAAAAGAATACTCATTCCAGTTTGTATTCAGTGGAATGACCTTAATGGCCATACTTTCCTTCACAGATACAGGGACGACAAGGCTCGTCCACGATCTCTACTCGTAAAGAAGGCGACGAGAAAAACAAGCGAAAAGGTAATTGCATCAGTTGTTTAGAAGTTGCGTACCATAACCTTTGAAAACAAAAGGAAGAAACAAAAAGAAGCTGAAACGTAACTGCACAGGCTAGCTGTTTGGTAAACCAAACTATTCATTACAGTTGTTGGGTTACTCTGCTAAACGAAAACATTCATTAGCCGACTGTTTATTCAAACTTGGTTTGTTTTAATACCACTAAGTGTGCAACTAAAAATTTTATCTTGCTGTGTTCTTTAAAATTTTAGCTGACGCTGGTGCTGTCATCAGCGGTGCTCAACTTGGTTTGGGCATTCTCACAACAATTTTAAACACCCTTGGTAGCATAAGCCGCAAGGTCGCCATTGGTGTTGATAACGAAAGTGGATATAAATGGAGAGCAATCAACATATATTTTTATTCAGGGACGTCAAATCGTGTTTTGCCTCATGATGTTTCTTCAGGTAAAATTAATATTAATAACATAAAAGTTTAAAATCCTTTCTTTTAGTAATGACACTGAGGCCCGTATGCAGGATGATGTCATTAGCTTGTTTGATTTGTTTTCCCAACGTTCTTCGGGGAATTTGTCTTTTATATAATTATGCATACATATACACGTGGGTGCACATGCAAAAGATCAGACGACATTGAAAGAAACAGTTCTCTGGGGTACTATCACCTGGTCTGAAATGGGAAAACAAAACGTACCAGCTAAAAAGGTCTGATCTATTAGTATAGAATCCATTTCGGTTTTCTTGTCATATTTAATTTTTCTAGCTAATCTCTGCGAAGAGTTTAACCTGTAATCAGGCGGTCCTTCTTCCCTTTTTGTTTTTGGGGGCCTGACCGCAGGTTATGCCAGGTTTGAAAAGTAAAGCCCTCGTTGGTCCAAGAAAGCAAAACCCCGAGTGATTCTGGTAGTAGTAGTAGTAGTGAAATGAGCTCATCGCGTGCAAAGGGCCTATTACTGTATGAAACCAGGGGCGAACTCAGCGACCACTTACCCTAAAGAGGAAAGAAACTGTATCAGAGGATTTGCCCCTTCTTGAAATTTTGCTGACAACAGGTTTACGTTTTGCTTTGTCAGAACGGCAAAATGACTTCAGGGTTGAAAGAGCTTCTTAAACCAACCAAACGTAGTATTCTGAGTGTGTGATGTTCAGAATGAACATCCCTTTGCGTGTAATATTTGGGGAAAAGATAA
GCAATCACTAACCTGACACAAATAAAATTTCTGGGTGGCAACAAAGGAGGACATGTTAATGATCTGACACATTCTATGTAAATTTTGTTTTTGGGTACACGACTCTATTCTGTGATGCTTGTTTGCCATGTATTATTGTGACGAATGATCATGTTTATGATTTTTGTTTAATTGCTAATTTGTTTGCTTATTTCTAGGCACAGCTTTGCTTTACGGCGCACGAAAAACTGCGGGGCCCGTTGCGAGAGGCGCAGTAGGAGTATTAACATACTACATCCCACACATCGATAAAACACTTGCTGTTATGTATTCCGTTCCATTTGACTACAACTGGTACAGCAACTGGTTTGATGTTTGGCTTTACAGTGGTAAAAGACGCGCCAACTACAATCTTTGGTACCGGATGTATTACGATAATCCCTTCAAAGGAGATAACTACTGGCACGAGAGAGATTTGGGATCGGGACTCCGAGCAAGGGGGGCCATGTCTAACTCCGGTCAAGCCACCATCGAAATTCACATACTGAAGCAGTAA
>gene-Peve_00000112
TTCTGCTCTCTATGTTTAGGTCAGCCTTATCAGCGTGGTCTCACAGCTATTCTTTGCACGATAGGCAGTGATAGCATCACTAAGTCTGTCATCCATAAACTTACTGTCTTGTTGGCCCAACAGTACTTCATAAACTTTATTCACGCATGGCAACACCGTGATAGGTCTATAGTTTATATCCCATTGTTTATCCAGTTTATCATCCTTCTTATGTACTGGTGTCCATTCCCCACGCTTAAGCTTTGCTATCCATTCTCCATTTCTAATCGCT
>gene-Peve_00000117
AACTTTCTTGCTGCAGTGAAGGACCTGAAGATACTAAAAACTCTGAGGTGAGTACCCCAGTAATGATTGTATTTCGTACGAAAAAAGTTAACAGGTCTTTGGCTAAAGAAAATAGATTTCTAAAATAATATTCTTGTTTTTAATAACGAAATTTCCACTGAAAGCACTCATTTTGCGACAGGTTTTAGATCCTTTGAGCTTTATGTTTAGCTCAGTGTTGCACACTTTTATGTTCGATTCGATTTTATGTTCAATAAACAGCCTGTCCGGCACTTTAAAAAAAAACTGAATAAGACTTGCTAAAATTTCTTTCATGTTTAAACAATCAATTGAAACTCTTAGGACTATTCTTGTTTTGCCTTCATTAAGCCTGTTCAATTCACTTGGCTGTTCATCATGTCAAAACCAAATTTTGGCGTTACTTTATCACCTATGCAGCTCAACCTTTCCTTGGTTTTTCTTTCGATTAAGTTTCTCGTTCTTTACCTAAAGCTGAAAAATCGAAAAAACCTCCAAACCAGTGCAACACTGACAACTTTTCCTGAATTTTTAATTGAGGAATAAAGCTTTTAATTAACTCCAACGTCAGCTTAGTAAGGTGCTTCCTTTTCAGACACAATCGTGAGGGAGAGGAATAATTTTGGGGAAGTTTCCATTAATAACTGTTCGCCAGAAAATTACTTTAAGATAAAAACAACTGAGTTCATGTTTTTTAAGACTATAATTTTAATGGGAAACCTGTGTTGATGCGAGAGTTTCAAAATTTTTCGTATTTATGGCATGCTTTTTGCATCAACATTTTCGCAGTTTCAATAACCCTGTCAAACAATACACACTTTTTGTTTCTTTGGCAATGTTTGTAAACCTCCTGGTCTTCTCAAACTTAACTCGAGAAGGAAAATTAACCGGGTTATGTTTAATAAAGCTTTTGCTTCACGCAAAAATGAAATTTCGGCACCCCATGTTTTCACTTGAATTTAAGTAGTTACATAAGCTTAGCTAAACTTTCATTTCCGCAAATCAATAAAACAGCGTTAACGTTGTGCGGCAAACTTTGCCCCTAGAATTTCTATCAGTGCTTTTTCCTTTCTTGTATAAATATTTTAGGTTTTTAATTGCCTTTAAGATTCAGAATGGAGCACACTGTATTTGTCATCTTTGTTTTGATCCTTGGAATAACTGTCACCACCTTGGGCTCTCCGATGTCAGACGATCATAAAGACGGCGATCCTGAACTATCTCCAAGGCTTAAGGTATACAAAAACGTTTTTTAAGGAGTTATTTTGCTCGAAATTCCTGTGTGGAAGCTGCATTTTTTTTATTCATCAGAGACCTCTGGCACGGCCTCTGAGCAGTACTGTGGCACAGCTGTGTAGCCGTCTCGTTACCCTTTCAGCAGTTAGTTAAGCTAATTCAATGGTTACTTGTTGTTTTTTGTTTTTGTTTTTTATCAAAATGATAGAATTTTGTTTGATCAGTCTGTATCGCCCATAATGCACCACGAAAAGCATAGTCATTCTTTAACGACAGGAAAAGGGACATTGTTTTTGTTTCGCAAAACAAATAAAAAACGAAAAGTGTGTTTTAACTGTTATTTACCGTTCTCTGAGCCTTCCCCAAATTCCAAAGTTTAGATAAGTGATTCTGCACAATAATTGTTCTAGCTGGTCTAGTGTAGCCGGAAACACTGGGAAAGTTAAATTAATTGGTGCTGTTGCCATTTCATTATTTTGTTTTGCATCTGGCTTGTTCATTTAAACTTTCCGCTAATTCTTATTTTTCCACACTGGTCATCCCATTCAGAAGTATCTCAAATAGGCTATTTTTCAAGGAAGGAGAAGGACAACTGGGGAAATTTAACTTAGAAATGACAAAAGAATTAGGTTAAAAACATGTATATTGACTTTAATTAAAAACCACAAATGTTTCTGTTTGATATAAAGAAAATTGAAGGACTGAGAATGCGTTCAAATCAATCAAAAGTCGCATTGGCAAAGGAT
GCTGAAAAATCAACAAAAGTCCAGGTGAAGTTATTGTTAGCCGTTCTATTTCTCATTTAATCATACGAGTGATTAACAAAATCGGACGACCGCGTAGCGGGAGTCCGATTTGTTTAATCACAAGTATGATTACAGACCGAATTGGATGACACGAAGTTCTGTTACCAATTAATCATAACTTTAACAAAATTTGTGATATATAGGGCTCTTTTTAAAATCAAAACCAAAGAAATTCCAAGATTTTTTCGCTAGCAATGAAAAAAAAGCCATTTAAGAGCGCGCGTGATGGCGCACACTGTCCAATTACTTATGCATGACGCGTACTGTCCTATTAAACTGTCCTATTAAGGCTGAAATCAGGGCAGTTGAGAACCAATCAGATTTGAGAATTTTGTTATAGTTATGATTACAGACCGAATTGGACTCCACTCAGTCCTGTTACCATTACATACTACTGAGCATATACACACGGGAAAACGCGATACTTATCTTAAACGAAAATTCACGGGTAGCCTGCGAGGCATGTTAATACTATCGATGCATTGGTACCGGAGAAAGAGAAATTCAAGTTAGTTCCAAGGGAAAATATTTCACCCTTCAATGTCAGCGTCTTTGGTTGTCGCCAGCACTACCACTATGGTATAAAGAGAGATCGTATAAAAAAAATAAACTTTGCTTGAACATATTTATCTCGACAGGAACGAAAAGAGCAAAGGACCAAGCGTCAAGAACCAGGTCAGTAGTTCATGTATGACCCCATGAAATGTCCTGAGACACCAGTGCCGTGTCGGGTCTCTAGGCACAGAAGCAAGTGTCCAAGAACCAACACGTAATATACCTTTCATTTCCTTCTTTATTGTGTAACATAGGTCAGGTGTTCATACGTTAATTAATTATTTATTTATTTATTGAGAAGCAATAACAGACCCCATGCAAAGGTCTGCTCCCAAAATACAAAAGAGATCCTCAAAATTTTTTTCGAAATTGTATGGCTTAAGACATCTCGAAGAGTCTTTCGTGTTGGAGCCCAGTATGGACCCAGTGGAGGCTGCGAGTTTATATATTATAAATTTGCCGCAAGTGGAAGGTATCTTGAACATATGCCATGCATTTGTAGGGACCGCGGAGCAGGGTTTGGAGTTGTGGGGGGAGGGGGCAGTTAATCTGAACCCGTAAGGGTGAGCCTACTAGGGGATACAGGGGCATGTTTCCCCTGTAAAATTTGATTTTAGGAGTCCTGAAACGGCTAGAAAAGCACCTAAAACTATTATACTTAAATAAAAGTAACGCAAGGCCAAATTTTTAACGATCATATCGCTGCTAAAGTCTGTGAAAGTTGTAACCGAGCTGGTGCTCCTTCAACTGGAAAAGGCAAGATAATTAAAAGGTATAATCGTACCAGTCTCACAATATTTAGGCAGTAAAATAAATCCAACTCCATCAATGACTTGTGCCTTGTTCGAAGTTGTTGCGGATTGGCCGCTTCATCTTCTGACAACTGAGTATCTGTCATTATTGGCTCAAACCTACCACACCACGCTGTTGAATATTTTTAAAAGACGTTTTCTGAACGTTCTTCCCTTTTTATCCCTACCAAAGCCTCTCGTCAGGTTTCGATCAGGTTCCTCATTGCGTGTTTCCAACTTGAAGTTTAAAACAGCATTGATAGAAGAAAGAAGATTCAATAACAAAACCTTCGTAACGTCCCAGTGTAACAAAACCAAACATTCAGGCCTCCTGTTGTTCAATAGGTGGTTAGCGCTACTCTCTCCTCCGCAGAGGCGTCTTTGTGTCGTAGGGAGGCTGGGGAGAAAGAAATAGAAAGCGCGCGGGGCACGGTGGGAAGGGGAGAGAGAGAAAAGAGGCTCCCGTCTTTTCCCTCTTCCCATCTCCCCCCCCCCTCCCCCCCTCGCACTTTTTATTTTTCGATTATTGCTATTTTTATTGTGATACCCAGCAAGAGCCTCTGCGGAGGAGAGAGTTAGCGCTAGCCGACGGAT
AAATCACTATCCAGTGGATAAGTATTAGGGGTGAATCATTTGAGCTATCCGCTAGGTAAAGATTTATCCGAGCGCTATCCACCTTTTGAACTACTGAAGCTGGCGGTAAAACATTGATCCAAAGGCAAATTAATTCTGGAAAGAAACAGAAGTTGTTAACTCCATTATTTTACCGTCAAAAATCCAAGCGTCCTTTCAGCAACAATAAGCCTTATCACGGTTGTCGACGACATCTTTTGAAAAGTGGGCAACGGCGTGAGAAATTACAGTGTTTGTATGAGAATCTAATATCGAATAATTTCCCTAAAACGGCGGTTCTGAAAAAGATATGAAGTGTAAACTAAAGCTTTGACTCCTAACGATTCTACTGAAAAAATGTAAGGGCGTAACATGCAGTTACATTCGCTAGAACCACTTTATAAAATTTTCTATACATTTGTCTGTAAAAATTTCTCTTCCGACTAAAATTTTCCGGGAAAAATTGCAGACACCGTGAATATATGGAAAACCTAAAGCAAGCGTTCTGGAGAAATTGAAGCACAGTAACAGCCCCCAAAATTTGTTAGTAAAATCCTTTCCTGTGTAGCTTTATTTAATTCATATATCTTTTTTCAGAACACCCGTTTTACGGAAATTATTCGACATTAGATTCTTCTACAATCACTAATTTCTCACGCGGTTGCCTACTCGTCAAAATGGCGTCGAAAACCGTGATGAGGCCTGGAAGGAACAAAAAGAAAAAACAATAGTATTGCAAAGTTTACAAACCAGAGACCAGATTCCGGACAGTTATGTGTTATCACAGCATGGAATTTTTGTCACTGCAGGCGCAGACGTCCCTCCTGGCGAGACGTCCCTTGCGGCGAGGAGCGAGGAGGGAAGGCTGTATTCGCAGGCTAGCGATTGTGGTGATCGTATAGAAAAAATGCACACCCCTAAACGTTTTTGTTTGGCTTTTTTCACACATAGCTGACATAATCGCAGCGGCTGCACTAACTGTTGGCGTACTTCAAGGGATCTTGGATGGCATCATAGGTGTAGATCGCAAGATAGCCATCGGCATCAAGAATAGAAGCGGATATAGCTGGGGAGCAATTGATATTTACTTGCAACAAGGAGTTACAGATAGGGTCATTCCCCGTAGAGTCGATAATGGTAACTATGAATGTCACGAATTTAGCATGAAAGCGCTTATCGGGGACACTGTTATAATGTCTCCTTTAGGAGACGGGGTGGCCATTTCTACGTAGTCTTCGCCCAATTTGATTAATCACTTTTTGTGCTGGAAGAGAAACAAGTCAGTACGTGCCCCTCCTTCAAATTTTTTTCCATTACTTTAAGTTTTTGTTATGGTTTTAGCTTCTCCCTCTTCTTCTTACTTAGGATTTGAGCGTAACCCATTAAGGTAGCTCAAGCTCGAAATATAAATTTGACCAGAGCGCCGTTGTGTCGAAATATGAGGATTTGTGTGGGAAAACTGATATAGCTTTTTAAGGGGGAAATTCGGCTGTTTGTTTTAGGGGGGACATGGGGCTGATTTGAGAAGGAAAGAAATAAGGGAAGAAAAGGTATACACATAAATGACCAGTTAGTTTGGCATTTGTCTACTAATTATTCTCTTCATATTTTAATTATTAACCTCCCCAAATTCAATTTTCCATTATTTTAAACATTTTCCCCCAAATTTTATTCATCAAAACCATCCAAAAACAGTCGCACATTGAGCCAACGACAACGACCTTATTGAGCCTTTTATTATTCAATGAAGTCTTGTCTAGATTTGGAAGAAAAAGAGCAGAAATAACGAAAAACAATGCTGTTGAGTAGAATTCCCCCTTTACTAAAACTGTAAACCAGTATTGAACAAAAACTTAACCAGGTCTAGTTCTCTTTTTGTTTGCCATTTTGCCTGATGTGTGGTTTATTGGTGGCCTTTGGTCCACTCCGAGTTTTGGTTTATATACCAACCCTTTAAAATTAAAGACCCAAAGGTCAGGGGC
ACCCAACGAGAATATAGTTCAAAACCACTTAACATAGCATTTTTGAACGTAATTTAGTCTTTAAACTGTAGATATAAACATATTTTTATCCCCTAACAATTTTTCATCTGTTCGGATTTCCTAGGTGAAAATCTAGGGATCCGAAAATTATAGGGATCAAAACTTAACTTTTCGAAAATTTCAGTCAGAAAAAAGGCTCCCGAAAATTCTAGATGACCTTTTTAGGGTAAAAATCCGCTTAAAATGGGCAATTGTAGCATTTTTTACATGTTCGAAAATCCTAGGAGAGGCAGGCAAGCAAGAAATTTTATAACAAATGTTCCGAAACTTCTAGATCTCAAATCGTCTTCCGAACAGATATTTTCCGAAAATTGACGTTGGGTGCCCCTGAAAAGTCGATAAGAATTCACGCAGGACGAAAAGAAACAAAGAACTAAGCGAGGTAGAACCGAGGTAGGTTATTTCACAATGCAACACTGTTGCCCAAATGTTAACCATATCGGCTTTTCCCCTACAAAACTCCATATTTCGGCTCATATCTCGAGCTTGGGAAAAAACCTCTTCGACAAGTTATGATGTTGTTTTCATGATGTTGATGATAAGTTCATTCAATATCGTAAATGCTCTTGGTTTTTGGCCATCTCAGTTTTAAAAGGAATAACACACAAACTGTGGTTAAAAACATCAGAAATAACAAACGCATTGTGTAACTGACTTTTATAGAGGTCTTTATAATATCGAACTTGACGTTTCGTATCCTTCAACTGACATCTTCAGAATTGACTGTTAAGATGAGTAGTAACTACACTTAAATAAAAAAAAGAACAAAGTAAGATAAGTAACTAATTAACGTGAAGGAAAAAACAAATAAATGTGACAGATGAGAAAATTTCCATGAAATTATGCAAACTAGTTTAAGTAAAGCTTTTCGCTGCTTATATTTTCATTTAGGGCTGGTTTTTAAAATCCCTAATAAGGTTATTTTCCAAAACACGCAAAACACACATCGCATCAAGTCTCTTTTCCCCGACAAAGACAGGCTCAACCGTTCTCAAATGTCCAATGTTGTATATAAAGCTAGTTGTTGGGACTGTCTGGATTTCTACATTAGCAAAACCACACGGAGATTGCATGACCGAAAAACTGAACACTTCAAGGCAATCACCAGTATTAACGGTCATTTATCAGCTATTGCAGAACTCGTCACTTCAACTGTTTACAATTTGAAATGGCACCATTTTGACATTTTAGCAAGAGGAAAATCAAATACCGATTGTAAAATCAAGGAGGCTCTTCTTATTAGGGATTTAAAACGAGCCCTAAATGAAAATATAAGCAGCGAAAAGCTTTACTTATACTAGTTTGCATAATTTCATGCAAATTTTCTCATCTGTCACATTTACTTCTTTTTTCCTTCACGGTAATTAGTTACTTAACTTACTTTGTTTTTTTGATTCAAATTTAATTACTACTCATGTTAACGGTCAATTCTGAAGATGTCAGTTGAAGCATACGAAACGTCAAGTTCGATATTATAAAAGTCAGTTATGCAATGCGTTTGTTATCTCTGATGTTTTTAACAGCAGTTTGTGTGTTATTCCTTTTAAGTTTATTCAAGAGCCATAATGACCATTATGGCTCTTAGTTCATAATTTTACTTTAAAAAATAATGTAAAAAGTAAGGGAGAATGCCCGCAGGGTATAGAGGAGTTAAGTTGATTTTCTTTATGGGTACCTGCAGTTGCCTGCAAACCTCGCTTTAGGTATGTGACTTCCCAAAGGGTAGGATTTTTGCGCCGTTTTGGTTTGAGAACGGGTACACGTAGGGGTCAAACCACCCCCCAAATGTTTCATGAAACCTAATTTTGGTTATTTTCAACATTATTATACTAAACTTGTGGTGTTTGCTCAAATTGATGTCCACTTCAATTTAAGCTGAAGAATTTCTCAAAATGGCACATAGCGGCGTCACAAAGGATGACGTCACAAAAACCAGTAAA
GTGAAATTTTTAATTATTATTTAAAGTGTCTCATCTAAAAGTGGGTTAGCCTGGATTTCTTCCGATTACTTCGAGTCGTTATTACTCCCTCAGTAACGTCTGAATCGACCGGCCGTTAATGCCGCCAGTGACGTCAATACTCCTTTGCTTTAATTCATGCAAAAAGTTAAACACGATTTTTAAGTGGCAAACCAAGAAATATCGTTCGGAAATGTCTACATGATACAGAACGGTTGACAGAATTGCTTTCATGCGTTGCGTGAATTTCTCGAACGTTCTCTACGTTTTTGGATTATTCATGAATAATTAATTAGGGCATGTTTATATTTCAGCATGAGTCAGCAGATTTGCATCATTTACTGAAATTATAAAATATGTCGAAAAGCTACTACTATTCGCTTGTTTAGTATTTTCTAGAACTTTATGTCAAACGGTATACAAGTTCCAAGCGAGTTCTTTTGACGCTCAGGCCACTTAAGGCATGACTATCTAATTAAATATGCATATTTTGATAAAATGCTATTTAGAACGGTTTGCTTCTATCAATTATCCTAAAAATTGAACTGCAAATGTAGCTTACTGGCCTTAGCAATATGAGTGCATTTTAGAGGCCCAAATTTTCGTCACGTGACTGACAATTGAGAATTAACGGTCAAAATATTTACTCTAAAATGAGGAAAGGATAATGAGAGAGTAACAAGTTCTTTTAGGTGCATTTCGGAGACTACTAAAAAAGGTTCTTAAAAATTCGCCTTGTATGTGTGTTTACGTTATATAGGAATTGGTAAATAAAAGAAAAAGGGTCACGTGACTTATTAATGAGTTAAATAGCTAATAGGATAAATAACATTTTCAATCGTACAAGGCGAATTTACGATTTACACTGTTTATGTAACATTTTGGTAGTGAAAATTCCTAAGATTAAGATTTGCAACCCTTCCCCGTTTAGGAAAACGTGTTTTGTGACGTCATTGACGAAGCAATATCACGTGTTTTTTTCTTTGAGCCACATGTGTATCTCACTTTTCAGGATTACTGATGAAAGCAAACCGTTCTAAATAGCATTTAACAAAATATGCATATTTAATTAGATAGTCATGCCTTAAGTGGCCTGAGCGAACAAAGTTTTTTCCCACGAGTCTGGAGTGCTGTGTTTAGTCAAGTGTGACGAAGGTCGATTTTCAAGTTCTGTGTTCACATTTGGCGGAAGCCGTAAGATATCTGAAGTTCAAGTGAGTGGTTTGTTATTATTGGCAGAGGCTGTTTAGTTTTACTTAAGTTCAAGTCAAGTTTGTGTGCTGTGTTTGCTGGATGCTGTGTTTTAAAGCTCTGTAAAGACTATGTTCCTTTGGTATTAAACTTCCTTGTGTTAATCCGACATCCCTGCGTGCTGATCGTCAGTGAATAATTATCCTCAAAACATTCGAACTCGTAACATTGAGTTTTAGCCAATTCCATGCTCAACGTAATTGACTCCTTACCCATGTCTTACATATCTTTAATCAGTACATGAGACTGCTATGCTGTTTTTGAAGCCAGATGTAAGAAAAATAGAGAATTCTTTTTTCGCTGTTTGTTTTGTTAGACTTGTTATTCACCGCTAGTANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCCCCCCCCCCCCCCCCCGACGTCGGTTTTTAGGGCAACACAGCGTGGCAATGTTGGAACAATGTTGTAACCATTCGCAACAATGTTGCAAAATCGTCGTTGCGAGTCGTCTCGTGTAACATCACCTTAAAGATCTTGCATTGAGCATTGAAACTACCAACTGAGCGAGGCCACGAGGTTATATATTTAAACGTTAAGACGTCAGAGAATTGATTGATGTGTTCAATTTTTCCTCTGGACTCTAAGGTTAGCCATTTTCCTGGCTGCCAACAGAGCTGCCAACTTGTTGGCGTGGATTTGAACAATGGGGAAAAGAGAACAAAAAAATTAGTCTGTAGTACAGTTTGAACTGAATCTGACTCAGCTTTTTCTTGCTTGTGTCTTCGAACTCGTTTCTAGATAAAGCACTGCTTTTCAAAGCACGAAGAAAAAATAACTTCTTTCCAGAGGGAGTATATGGAGTAATGACGTACTGCATAATCGCCATTGACAAAACGCTGGCTATCATGTTTTATGTTCCTCATGATCAAACTTTTGACAGAAATTGGTGGAATGTTAAGCTTTACAGTGGATATAAAAGGAGCAACTCTCAAATGCTTCACGAAATGTATGAAGATCATAGCGCTATCGAGGGTGATGCCTACTGGCACTATAAAGACCTGGGCCTGGGATCAGAGCTTAGAGCAACTGGTGCCATGGCTTCCTCTGGTGCAGCTCATCTTATGATTGAAATAAGCAAGCAGCACCGAAAAAGACTTTAA
>gene-Peve_00000122
TTACTTATGCATGACGCGTACTGTCCTATTAAACTGTCCTATTAAGGCTGAAATCAGGGCAGTTGAGAACCAATCAGATTTGAGAATTTTGTTATAGTTATGATTACAGACCGAATTGGACTCCACTCAGTCCTGTTACCATTACATACTACTGAGCATATACACACGGGAAAACGCGATACTTATCTTAAACGAAAATTCACGGGTAGCCTGCGAGGCATGTTAATACTATCGATGCATTGGTACCGGAGAAAGAGAAATTCAAGTTAGTTCCAAGGGAAAATATTTCACCCTTCAATGTCAGCGTCTTTGGTTGTCGCCAGCACTACCACTATGGTATAAAGAGAGATCGTATAAAAAAAATAAACTTTGCTTGAACATATTTATCTCGACAGGAACGAAAAGAGCAAAGGACCAAGCGTCAAGAACCAGGTCAGTAGTTCATGTATGACCCCATGAAATGTCCTGAGACACCAGTGCCGTGTCGGGTCTCTAGGCACAGAAGCAAGTGTCCAAGAACCAACACGTAATATACCTTTCATTTCCTTCTTTATTGTGTAACATAGGTCAGGTGTTCATACGTTAATTAATTATTTATTTATTTATTGAGAAGCAATAACAGACCCCATGCAAAGGTCTGCTCCCAAAATACAAAAGAGATCCTCAAAATTTTTTTCGAAATTGTATGGCTTAAGACATCTCGAAGAGTCTTTCGTGTTGGAGCCCAGTATGGACCCAGTGGAGGCTGCGAGTTTATATATTATAAATTTGCCGCAAGTGGAAGGTATCTTGAACATATGCCATGCATTTGTAGGGACCGCGGAGCAGGGTTTGGAGTTGTGGGGGGAGGGGGCAGTTAATCTGAACCCGTAAGGGTGAGCCTACTAGGGGATACAGGGGCATGTTTCCCCTGTAAAATTTGATTTTAGGAGTCCTGAAACGGCTAGAAAAGCACCTAAAACTATTATACTTAAATAAAAGTAACGCAAGGCCAAATTTTTAACGATCATATCGCTGCTAAAGTCTGTGAAAGTTGTAACCGAGCTGGTGCTCCTTCAACTGGAAAAGGCAAGATAATTAAAAGGTATAATCGTACCAGTCTCACAATATTTAGGCAGTAAAATAAATCCAACTCCATCAATGACTTGTGCCTTGTTCGAAGTTGTTGCGGATTGGCCGCTTCATCTTCTGACAACTGAGTATCTGTCATTATTGGCTCAAACCTACCACACCACGCTGTTGAATATTTTTAAAAGACGTTTTCTGAACGTTCTTCCCTTTTTATCCCTACCAAAGCCTCTCGTCAGGTTTCGATCAGGTTCCTCATTGCGTGTTTCCAACTTGAAGTTTAAAACAGCATTGATAGAAGAAAGAAGATTCAATAACAAAACCTTCGTAACGTCCCAGTGTAACAAAACCAAACATTCAGGCCTCCTGTTGTTCAATAGGTGGTTAGCGCTACTCTCTCCTCCGCAGAGGCGTCTTTGTGTCGTAGGGAGGCTGGGGAGAAAGAAATAGAAAGCGCGCGGGGCACGGTGGGAAGGGGAGAGAGAGAAAAGAGGCTCCCGTCTTTTCCCTCTTCCCATCTCCCCCCCCCCTCCCCCCCTCGCACTTTTTATTTTTCGATTATTGCTATTTTTATTGTGATACCCAGCAAGAGCCTCTGCGGAGGAGAGAGTTAGCGCTAGCCGACGGATAAATCACTATCCAGTGGATAAGTATTAGGGGTGAATCATTTGAGCTATCCGCTAGGTAAAGATTTATCCGAGCGCTATCCACCTTTTGAACTACTGAAGCTGGCGGTAAAACATTGATCCAAAGGCAAATTAATTCTGGAAAGAAACAGAAGTTGTTAACTCCATTATTTTACCGTCAAAAATCCAAGCGTCCTTTCAGCAACAATAAGCCTTATCACGGTTGTCGACGACATCTTTTGAAAAGTGGGCAACGGCGTGAGAAATTACAGTGTTTGTATGAGAATCTAATATCGAATAATTTCC
CTAAAACGGCGGTTCTGAAAAAGATATGAAGTGTAAACTAAAGCTTTGACTCCTAACGATTCTACTGAAAAAATGTAAGGGCGTAACATGCAGTTACATTCGCTAGAACCACTTTATAAAATTTTCTATACATTTGTCTGTAAAAATTTCTCTTCCGACTAAAATTTTCCGGGAAAAATTGCAGACACCGTGAATATATGGAAAACCTAAAGCAAGCGTTCTGGAGAAATTGAAGCACAGTAACAGCCCCCAAAATTTGTTAGTAAAATCCTTTCCTGTGTAGCTTTATTTAATTCATATATCTTTTTTCAGAACACCCGTTTTACGGAAATTATTCGACATTAGATTCTTCTACAATCACTAATTTCTCACGCGGTTGCCTACTCGTCAAAATGGCGTCGAAAACCGTGATGAGGCCTGGAAGGAACAAAAAGAAAAAACAATAGTATTGCAAAGTTTACAAACCAGAGACCAGATTCCGGACAGTTATGTGTTATCACAGCATGGAATTTTTGTCACTGCAGGCGCAGACGTCCCTCCTGGCGAGACGTCCCTTGCGGCGAGGAGCGAGGAGGGAAGGCTGTATTCGCAGGCTAGCGATTGTGGTGATCGTATAGAAAAAATGCACACCCCTAAACGTTTTTGTTTGGCTTTTTTCACACATAGCTGACATAATCGCAGCGGCTGCACTAACTGTTGGCGTACTTCAAGGGATCTTGGATGGCATCATAGGTGTAGATCGCAAGATAGCCATCGGCATCAAGAATAGAAGCGGATATAGCTGGGGAGCAATTGATATTTACTTGCAACAAGGAGTTACAGATAGGGTCATTCCCCGTAGAGTCGATAATGGTAACTATGAATGTCACGAATTTAGCATGAAAGCGCTTATCGGGGACACTGTTATAATGTCTCCTTTAGGAGACGGGGTGGCCATTTCTACGTAGTCTTCGCCCAATTTGATTAATCACTTTTTGTGCTGGAAGAGAAACAAGTCAGTACGTGCCCCTCCTTCAAATTTTTTTCCATTACTTTAAGTTTTTGTTATGGTTTTAGCTTCTCCCTCTTCTTCTTACTTAGGATTTGAGCGTAACCCATTAAGGTAGCTCAAGCTCGAAATATAAATTTGACCAGAGCGCCGTTGTGTCGAAATATGAGGATTTGTGTGGGAAAACTGATATAGCTTTTTAAGGGGGAAATTCGGCTGTTTGTTTTAGGGGGGACATGGGGCTGATTTGAGAAGGAAAGAAATAAGGGAAGAAAAGGTATACACATAAATGACCAGTTAGTTTGGCATTTGTCTACTAATTATTCTCTTCATATTTTAATTATTAACCTCCCCAAATTCAATTTTCCATTATTTTAAACATTTTCCCCCAAATTTTATTCATCAAAACCATCCAAAAACAGTCGCACATTGAGCCAACGACAACGACCTTATTGAGCCTTTTATTATTCAATGAAGTCTTGTCTAGATTTGGAAGAAAAAGAGCAGAAATAACGAAAAACAATGCTGTTGAGTAGAATTCCCCCTTTACTAAAACTGTAAACCAGTATTGAACAAAAACTTAACCAGGTCTAGTTCTCTTTTTGTTTGCCATTTTGCCTGATGTGTGGTTTATTGGTGGCCTTTGGTCCACTCCGAGTTTTGGTTTATATACCAACCCTTTAAAATTAAAGACCCAAAGGTCAGGGGCACCCAACGAGAATATAGTTCAAAACCACTTAACATAGCATTTTTGAACGTAATTTAGTCTTTAAACTGTAGATATAAACATATTTTTATCCCCTAACAATTTTTCATCTGTTCGGATTTCCTAGGTGAAAATCTAGGGATCCGAAAATTATAGGGATCAAAACTTAACTTTTCGAAAATTTCAGTCAGAAAAAAGGCTCCCGAAAATTCTAGATGACCTTTTTAGGGTAAAAATCCGCTTAAAATGGGCAATTGTAGCATTTTTTACATGTTCGAAAATCCTAGGAGAGGCAGGCAAGCAAGA
AATTTTATAACAAATGTTCCGAAACTTCTAGATCTCAAATCGTCTTCCGAACAGATATTTTCCGAAAATTGACGTTGGGTGCCCCTGAAAAGTCGATAAGAATTCACGCAGGACGAAAAGAAACAAAGAACTAAGCGAGGTAGAACCGAGGTAGGTTATTTCACAATGCAACACTGTTGCCCAAATGTTAACCATATCGGCTTTTCCCCTACAAAACTCCATATTTCGGCTCATATCTCGAGCTTGGGAAAAAACCTCTTCGACAAGTTATGATGTTGTTTTCATGATGTTGATGATAAGTTCATTCAATATCGTAAATGCTCTTGGTTTTTGGCCATCTCAGTTTTAAAAGGAATAACACACAAACTGTGGTTAAAAACATCAGAAATAACAAACGCATTGTGTAACTGACTTTTATAGAGGTCTTTATAATATCGAACTTGACGTTTCGTATCCTTCAACTGACATCTTCAGAATTGACTGTTAAGATGAGTAGTAACTACACTTAAATAAAAAAAAGAACAAAGTAAGATAAGTAACTAATTAACGTGAAGGAAAAAACAAATAAATGTGACAGATGAGAAAATTTCCATGAAATTATGCAAACTAGTTTAAGTAAAGCTTTTCGCTGCTTATATTTTCATTTAGGGCTGGTTTTTAAAATCCCTAATAAGGTTATTTTCCAAAACACGCAAAACACACATCGCATCAAGTCTCTTTTCCCCGACAAAGACAGGCTCAACCGTTCTCAAATGTCCAATGTTGTATATAAAGCTAGTTGTTGGGACTGTCTGGATTTCTACATTAGCAAAACCACACGGAGATTGCATGACCGAAAAACTGAACACTTCAAGGCAATCACCAGTATTAACGGTCATTTATCAGCTATTGCAGAACTCGTCACTTCAACTGTTTACAATTTGAAATGGCACCATTTTGACATTTTAGCAAGAGGAAAATCAAATACCGATTGTAAAATCAAGGAGGCTCTTCTTATTAGGGATTTAAAACGAGCCCTAAATGAAAATATAAGCAGCGAAAAGCTTTACTTATACTAGTTTGCATAATTTCATGCAAATTTTCTCATCTGTCACATTTACTTCTTTTTTCCTTCACGGTAATTAGTTACTTAACTTACTTTGTTTTTTTGATTCAAATTTAATTACTACTCATGTTAACGGTCAATTCTGAAGATGTCAGTTGAAGCATACGAAACGTCAAGTTCGATATTATAAAAGTCAGTTATGCAATGCGTTTGTTATCTCTGATGTTTTTAACAGCAGTTTGTGTGTTATTCCTTTTAAGTTTATTCAAGAGCCATAATGACCATTATGGCTCTTAGTTCATAATTTTACTTTAAAAAATAATGTAAAAAGTAAGGGAGAATGCCCGCAGGGTATAGAGGAGTTAAGTTGATTTTCTTTATGGGTACCTGCAGTTGCCTGCAAACCTCGCTTTAGGTATGTGACTTCCCAAAGGGTAGGATTTTTGCGCCGTTTTGGTTTGAGAACGGGTACACGTAGGGGTCAAACCACCCCCCAAATGTTTCATGAAACCTAATTTTGGTTATTTTCAACATTATTATACTAAACTTGTGGTGTTTGCTCAAATTGATGTCCACTTCAATTTAAGCTGAAGAATTTCTCAAAATGGCACATAGCGGCGTCACAAAGGATGACGTCACAAAAACCAGTAAAGTGAAATTTTTAATTATTATTTAAAGTGTCTCATCTAAAAGTGGGTTAGCCTGGATTTCTTCCGATTACTTCGAGTCGTTATTACTCCCTCAGTAACGTCTGAATCGACCGGCCGTTAATGCCGCCAGTGACGTCAATACTCCTTTGCTTTAATTCATGCAAAAAGTTAAACACGATTTTTAAGTGGCAAACCAAGAAATATCGTTCGGAAATGTCTACATGATACAGAACGGTTGACAGAATTGCTTTCATGCGTTGCGTGAATTTCTCGAACGTTCTCTACGTTTTTGGATTATTCATGAA
TAATTAATTAGGGCATGTTTATATTTCAGCATGAGTCAGCAGATTTGCATCATTTACTGAAATTATAAAATATGTCGAAAAGCTACTACTATTCGCTTGTTTAGTATTTTCTAGAACTTTATGTCAAACGGTATACAAGTTCCAAGCGAGTTCTTTTGACGCTCAGGCCACTTAAGGCATGACTATCTAATTAAATATGCATATTTTGATAAAATGCTATTTAGAACGGTTTGCTTCTATCAATTATCCTAAAAATTGAACTGCAAATGTAGCTTACTGGCCTTAGCAATATGAGTGCATTTTAGAGGCCCAAATTTTCGTCACGTGACTGACAATTGAGAATTAACGGTCAAAATATTTACTCTAAAATGAGGAAAGGATAATGAGAGAGTAACAAGTTCTTTTAGGTGCATTTCGGAGACTACTAAAAAAGGTTCTTAAAAATTCGCCTTGTATGTGTGTTTACGTTATATAGGAATTGGTAAATAAAAGAAAAAGGGTCACGTGACTTATTAATGAGTTAAATAGCTAATAGGATAAATAACATTTTCAATCGTACAAGGCGAATTTACGATTTACACTGTTTATGTAACATTTTGGTAGTGAAAATTCCTAAGATTAAGATTTGCAACCCTTCCCCGTTTAGGAAAACGTGTTTTGTGACGTCATTGACGAAGCAATATCACGTGTTTTTTTCTTTGAGCCACATGTGTATCTCACTTTTCAGGATTACTGATGAAAGCAAACCGTTCTAAATAGCATTTAACAAAATATGCATATTTAATTAGATAGTCATGCCTTAAGTGGCCTGAGCGAACAAAGTTTTTTCCCACGAGTCTGGAGTGCTGTGTTTAGTCAAGTGTGACGAAGGTCGATTTTCAAGTTCTGTGTTCACATTTGGCGGAAGCCGTAAGATATCTGAAGTTCAAGTGAGTGGTTTGTTATTATTGGCAGAGGCTGTTTAGTTTTACTTAAGTTCAAGTCAAGTTTGTGTGCTGTGTTTGCTGGATGCTGTGTTTTAAAGCTCTGTAAAGACTATGTTCCTTTGGTATTAAACTTCCTTGTGTTAATCCGACATCCCTGCGTGCTGATCGTCAGTGAATAATTATCCTCAAAACATTCGAACTCGTAACATTGAGTTTTAGCCAATTCCATGCTCAACGTAATTGACTCCTTACCCATGTCTTACATATCTTTAATCAGTACATGAGACTGCTATGCTGTTTTTGAAGCCAGATGTAAGAAAAATAGAGAATTCTTTTTTCGCTGTTTGTTTTGTTAGACTTGTTATTCACCGCTAGTANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCCCCCCCCCCCCCCCCCGACGTCGGTTTTTAGGGCAACACAGCGTGGCAATGTTGGAACAATGTTGTAACCATTCGCAACAATGTTGCAAAATCGTCGTTGCGAGTCGTCTCGTGTAACATCACCTTAAAGATCTTGCATTGAGCATTGAAACTACCAACTGAGCGAGGCCACGAGGTTATATATTTAAACGTTAAGACGTCAGAGAATTGATTGATGTGTTCAATTTTTCCTCTGGACTCTAAGGTTAGCCATTTTCCTGGCTGCCAACAGAGCTGCCAACTTGTTGGCGTGGATTTGAACAATGGGGAAAAGAGAACAAAAAAATTAGTCTGTAGTACAGTTTGAACTGAATCTGACTCAGCTTTTTCTTGCTTGTGTCTTCGAACTCGTTTCTAGATAAAGCACTGCTTTTCAAAGCACGAAGAAAAAATAACTTCTTTCCAGAGGGAGTATATGGAGTAATGACGTACTGCATAATCGCCATTGACAAAACGCTGGCTATCATGTTTTATGTTCCTCATGATCAAACTTTTGACAGAAATTGGTGGAATGTTAAGCTTTACAGTGGATATAAAAGGAGCAACTCTCAAATGCTTCACGAAATGTATGAAGATCATAGCGCTATCGAGGGTGATGCCTACTGGCACTATAAAGACCTGGGCCTGGGATCAGAGCTTAGAGCAACTGGTGCCATGGCTTCCTCTGGTGCAGCTCATCTTATGATTGAAATAAGCAAGCAGCACCGAAAAAGACTTTAAACATTAAGTCATGTATTCCCTTTATTACAATAACGTGTGTTTGTTTTTCTTTAATAAACTGAGATAATCTTAAGTCTAGAGGCCTGGCCAATTTGTATAAACTCCTCTAAAAATATCCTCAGAGTTGTCCTTAAATTTTAAATGCTAAGTTTTGCTTTATGTCAATGACTTTAGTGGGCCTGTTAATGGAAATAGCGAGAACGTTATTCCATGGATCCTACCCATTTGTGGCTGTACGGATCTGTCCTAAATGGTCGACAACGATTTAAAGCTAAAAAGGCCGGGATTTTCCGAGTCAGGGCCAATTAGTAAGTTCAGCATAGCATGAGCATGCCAGTTGCTATGAAAATGCGGTCTATATACTTCTCTATAAAAACCTTAACTTTTATT
TATCCTTTCACCCTGCAATTGTTCATTAAATATTTCCTAAAATGTGTGCCAACACAAGGGTATTATAAATTAAAGGTTTCCAAAGAGTGGTATATATAATGCACCGATAATTTTCAAAACTTTGGGCGTCTACAACAATGAAAGGATTCCTGTGGGTGTCTCTACAAAGTTTTCGGCTTTTGGGCAACATCAGAAACTCATAAGCATATTGAACAGCAACTAACAACTTCTCATGACCTGTTTTAGGAAAAAAGAAACCCGGCCCTAGCTATCTTTAAATCGTAGCACGTAAATCGTTTAGGGCAGATAAGTATAGCACAGACGAACAACTGACTCTCAAGTTGCAAATATTACATATTTGGCTTATTACCAGCAGTTTCTATTATTTTTCCATAAGTCAGTAAAGCCGTTTTGAAAAAATGTCTAATGTTTAATCTGACAGAACGAGAAGTTGATTAGTTACTTTGAATTTTATCCAAACTGCTCTAGCCTTTTTTAGATTTTTCGTTTGCGTGTGTGTGTGTGTGTGTGTGTGTCTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTTTTTGTTTGTCACGATATGGAGCCTATAGCTCCGTATACCACTGTTAGGCAAAAAAAAAGGGAAGGACGGAAGGACCAGTTGAAAACCCTCTATTTTCGTACACTTTTCAAAATAAAAACCTCGAGTTTTTATCCGAGACGTTTTAGAAAGGACTAAATGTTCTTACATTATTTTGGCGATTTTAACTTATTTGTTGCATTGTTAACCTTGATACAAAATAGTGCCATCAGTCGATTGGGATTATTTGGACTATACTTAAGTTTAGACCTAACATCGTCAGTTACTTTCTATCTACGGTAACTTCATTATAACGTTGTATGCTGCACCGCATTATCTTGGCCTATGTTCTTAATATCTTCTTAAGTTTTGTCCATTTTCAACGGCTTGAACGTTCTTTTGCTGTTCTAGTTTTCTCTTAAACGAAATGAAACTGGTCATTTCGTATCGTGCTTTCCACGTTTGTCTTTGAAATTAACGCAGTATCTTATTTTAACGTAGAGAATGTGTGCAAAAATCCGTTGAAAACTAATGGTTCTTATTTGACATTAACCGTCGTTAGTGTTGGAGTTCAGTGTGTGAGCATTAATTCAGCTGATCAGAATAAAAGCATACATTCATCCTTTTATGACTCCGTATGAGAATTTTTGTATTAAGAATACGAATTAAAACATAACTCCACTTTTACCATTTAATTCGATCTCACCTAGGGGAAGGGGGAAGGCATACACATAAGGGAAGTTTGGGTAGGGGTCGAGACAATCCCCGTTTAAAAAATCTGTTTAACAGAGAATAAGATTTAGCAAAAATAATAATAAAATCGTTGGTACCACAATGATTCAGACCGCGCACCACCAATGTTCACACCTTAAAGGCGCTAAATAACGCAATTGTATACCCTCTTTAAGACTCAAGACCCAGAAAAACAAATCCTGTTGGGCGGGACATACCCCGGTTTAGGCACCAGTCAGTTGCATTGTCTTCTAAGGGTAAAAACTCTTTAAATGTTCACACCCTCCATCGTTCATATAGAATCGTTTTCGCCATTCAAACTTCATTGCATTTGCATTTAGCAAGAGAATTTTAAACTTCACTTTTTGATAGTTTCGAGAATCGTGATATAAAAACTGGCCCGAAACGTGCTGAATATCTTTAACTAACTGACAACAAAATGAGAACGTGATAAAATTATTTCGTCAACATTTGTGAACTTTTGCCCTCTCACCTCTTCTGAGCATGTTCCACATGTTACCACAGCTCAGGAGTGAGAATCATTGTAGCGTAACAAAAATCATTATAAGCTGGGTTTTTGAGGCGATTTTAACGGTCGTCACTTAGGAAAATTGTAGAGAGAACAAAGAAAGAACTAAAACTCGCTGGAATAGCGCTGAAAGTGTCAAAAATGCTAAGAACAAAACTAGGAGATAAGG
TAAGGAGTTCTAGAGTTTATTTTGGACATCTTAATAATAATCACTAAACATGCTAATATCCACATAATATTTCTGTTTTCTTACTCTTGAGCTTGTCACCCCTCTCTAACAGTTATCAGAGTGTAAAGTCTCATTATGATATAATTGCTTTATCGCCCATAAAGGTATGGAGAAGAAAGAAAATGATCAAAAGTCTTCCTTGTTAATCAGTTTCTGGTTATCATAACTGTAGCAGAAGTACGGAAAACAGCTAGTGAGGAAAATACGACACATTTCCAATGATATGGCAACAGAACGTTAATATATTTTTAGGCTTCCTTGCGAATATCCAAATGCAGTAATTATGTGTCAGTTAGTAAGATGCCAAAAGGACGTGCGATTTCACTATGTTGACTATATTGCCGAGGGATTCTTAATCCGATCATACGGATTGGGGGATAAATCCTTTAGCAGTTTGACACTGTCCGGAAAAAACCTGTTCAATTCTTAACACGACTCCTGAACACAAGATTTTACTGCACCGAACAACTCCTTCATTGTTATGTGTCAGAGCAGCCAATAAAGACGCGTGTTGCATGAAGCTGGCAAAACTCCAAATGTAAACAACATCCGGGATGTCAACTTGTGCTTGTATATTTACTGGAGCATCGAACGTCCCACCCAATTTTACGCAAGAGCCGTGTTTATTACTGATCTTGCACCTCTGTTCCACATCAGTTTCGACTCACCAGTTCCTTCGCCTAAATCATGGCAACCTATATTCCATTAGTCACACCACTGGGCGAATTGCCGGAATTTCCCGAGGAGGAAAAGATTTACGACCTAAAAATGGAAGGTGGAGGAGTCCCACCAGAAGCGAATGAAGTGTTCGTGTACGTATTTGTCACAACACACGGGGAAGGATCTTTTCAGAGGGGATACTACGAAATTTCTACAAACATGCTGGGAGGACCAGATCTCAAGCAGTACATGAATGTGGCGACAGGGCAAGGCTTGTTGGCCGTTAACTCTGCCAATATGTGGTTTCCTGTTGGAGCTGGCCAGTTGAAGGTAAAGTTGGTCCACCCAGGTGAGGCTAAGAAAAGCATTGCAGGGAAGACCGACGAAAAAGATTGGAGCGGAGTGTTCATCATTGGATACCGATAGAGTGTTTGCTAGTTGGAGACCCGATCCGCGCACTTTCAATATTAACCAAGGAGCTGTTAAAACAATATTGTAGTTAACAGAGCAGAACTAATTGCATAAGACAAGGAACAACAACTAAAACGTAGAACTGACGCTCAGTAAGTTTACAGTAGAGGTGGCCAAGAATCAGTGTTTCATAACACGATTCTTTCTGAACAACAAGATATTCACCAACATGTACGCCGTAACACATTTTTAAAATAAAACTTGCAATAGAATTTACAATTTCTTTCCTGTTGTTTGTTTATTCATTTCAGGTTCAGAGACATGCACTCCCGCACACACAGCATTTAAAAGAAACTTTAACCCTGCGTCTCAACCCACATATTCTTCACACTGTTCACAATTTACTTCTTTTGGCTCTTTTAAGGAGAGCTCATCTAAATATTGGAACATTTTACTTTTCTTGATTATTTTCTCTACTCACACTACCTTCAATGTTTGAATTAGAATTCGTACGATAGAGAGAGGTTAGATGCTGATCACTGATAGGGTTGAAAGGTATAAGGGTACCTCATTTTTAAGAGACCGTATCCATTTTGCGGTAAGATGTGTCGACGGATATTGGCATGGCTTTGCATCGTACAAAAAAGGTAATTAGTTTAGGATGACCAGGCCGACCCCCTAATCTATAAAAACACTTCAACCATTTAACGCCAGATGCACTTGCAACTTATAGTACTGAAATTCATGAACTTTTGCCTGTTATGTTAATGGGGAAACGCATTTCTGACCGTTAATGAAGACTTGGTCACGAAAACTATTTGCAGGTGGATCATCGTTTAAAGAAGGCTAAGTTCAACTGATATATAAAA
AATATATTGATACATACAATATATAATTCGTGTTAAGTTGACATTTGTTTTAATCGGCGTCGCTGTCGTCGAAAAGCTGTAAAGGCAGCGTGCAAACGGACGCAAACGGACGCAACAACTCCCAACATTAGAGGTAAAGACAAATTTTGAAACATTTAGCTCAAAAAAGGTCGATTAGAAAAGGAAGTTTTAACCCCTTTACTCTATTTACTTTCGTGAGCACTTAAGTGGACAATTAACTGATGTGCAAATACAAAAAAGTAGCGGGTTGATTTTTACACGGCAACTGTCTACTACATTAACTTAGCGTCATTATTTCACGCACGCTCTTAAAACGGGGGGTAAGAAAAATGGGAAAGAGATCCATTTGCTATTGCTCGAGAGAAAAAAAAACCGCTCGTTTTTCCCCACACGGGGTAACTTTGCTCTAATTTGTTAAACAAATTCAAAGCAGTTAATAATTTTTATTTTTTGTTTGTTTTTTTTCGACAATATATGAATTAATGCAAACTTTTACTAGCTGTGTCGCTCGCTCAAATAGATCGATTAATGGAGGAACTGACTTTTAAAGTCAATATCGTTTTCCTTTACTGCATTAAAATTGACACGGGATTTGGTCCTCCAGAGAGGCTTCTTAGTACACTAGCAGGCTGAGACCAGGGTAAAAACAAACCGCAGGGAAACAATGGAAAAGCCCAGTAAATAATTTAATTTATCACCTTTTTGTTTCCAATATTAGTGAGAGTAGACTGAGTGCCGGTGCTCCAGCCCCTCTAATAAAATCGGGTAATTTCTCTACAGTGAGAACCTCTGAGAAAGAGCACACCGGCGGATAAAACTAGTAGTAAAAAAAAAACAGATTATGGGTACTTACCCTGAAACGTCTTCTTACCGCATCCCAGCACAAACTGAATGATGCTGGCAAGCTCGGCCTCTCACACGCAGACACAAATACAGTGCCGTATACATACTAGTAATAACCAAAGGTTACGTAGTGCGGGCGACAACGATCGAAGGTAAAATCGCTTTTGCTCCAGCGCTGTGCTTTGCGTAAGTTTCACGTGACTGATAGTCAAAACATGCATAACCAGATTACGAGTGATAGAAGGTCCAAACGCGCGTGATCAAAGTGTGTTCTGTGCATTCTATACATTCATAGTGGAAGTCCAAACGAATGCTGGCTACATACATGCGACGCATGCGTAATAACAACCTGGAGATTAGGATTGCGCGCTCCTCTTGTTGCCCGCAATTAGAATTGTAAATAGGACATTTGCATGATTGCGTCATTTTACTACTATGACCAGGATCCTTCAGGGTTTTGCTTTCTTCTGCAAATTAGGGCTTTTGTTATTTAAACCTCACTGGGATTACCATATTTAAATATGAAAAGAAAAACGAAAAATTCTGGTCCTAGTAGTAAAATGACGCCATCGTGCAAATGTCCTATTGTAGACTGGTGTCTGCTTTAGAAAGCTCCCTCTTGACATTTTTCATATGAGAAATAAAATTGACTCGACGTCATTAGTATTAGAGATTGTAAGAGAATTGTTACTTTTTAATTGGCACATTTGTTTCTATAAACATGCCAATTAAAGGTCACTAGATCTCACTAGCTCACAGGTACTGGCAAAGTTTCATTCTTTCCCCAGTCTCCCCATTTGGTATCGTTTAATATCTAGTCAACAATAATACTGACATGAGAATTTTTATGCCACACTTTATTTTTCTTTTCGATGAAGTCAACGTGTCCATTAATCAAGTAATATTAGACTATTATCTACTTGTCTTTTACAGGTATTAGGTTTATTTACATATATGGCTGGTGGGAGTTTCATCAAAGAGACATTCAGAATTGTAACTTTTGAAAATATCGTCCTCATAATCTTATATCACAATCGCTGTGATTCACTGATTATATTTCGTATTTGCTTCTTTACCTTCTCAGCTTTAGTCCTTTCCGTCTTTTTAAGGTCACAAAAACATTCTCAAATTGATGA
CGATGAATAGAAATCAGGAAAACACCTTTTAGTGCATGATAAATATGCTAACTTAGACTCTTCCAGGAAAAGAAGTCTTAAAATTAGCTATATACTTTGATTCTTTTGGGAAAGTTTTTCTCTCGATGTATAACAATCAGAAAAATGTTTCATCACATTGACCACCTTGGTTGTCCCATTTAAATTTTTGCAAAGGAACTTAAATCTTTTTTAGAAATGAAACTTTACATATAAAGTTAACCAATGACAAGAGACTATTTACTGCTTCAGTATGTGAATTTCGATGGTGGCTTGACCGGAGTTAGACATGGCATCCCTCAATCGGAGTCCCGATCCCAAATTTCTATCGTTCCAGTAGTTATCTCCTTTGAAGGGATTATCGTAATACATCCGGTACCAAAGATTGTAGTTGGCGCGTCTTTTACCACTGTAAAGCCAAACATCAAACCAGTTGCTGTACCAGTTGTAGTCAAATGGAACGGAATACATAACAGCAAGTGTTTTATCGATGTGTGGGATGTAGTATGTTAATACTCCTACTGCGCCTCTCGCAACGGGCCCCGCAGTTTTTCGTGCGCCGTAAAGCAAAGCTGTGCCTAGAAATAAGCAAACAAATTAGCAATTAAACAAAAATCATAAACATGATCATTCGTCACAATGATACATGGCAAACGAGCATCACAGAATAGAGGCATGTACCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCTACTGCTACTGCTACTGCTACTGCTACTGCTACTGCTACTGCTACTGCTACTGCTACTGCTACTGCTACTGCTACTGCTACTGCTACTGCTACTGCTACTGCTACTGCTACTACTACTACTACTACTACCACCAGAATCACTCAGGGTTTTGCTTTCTTACAAACGAGGGCTTTACTTTTCAAACCTCGCATAACCTGTGATCAGGCCCCCAAAGCAAAACGGGAAGAAGGACAGCCTTATTACAGTTTAAACTCTTCGCAGAGATTGAATAGATTCTATAGCAATAGATCAGACCTTTTTAGCTTGTATGTTTTGTTTTCCCATTTCAGACCAAGTGATGGTACCCCAGAGAACTGTTTCTTTCAATGTCGTCTTTTGCACGTGCACCCACATGTCAGGATGTATGCATAATTATATAAAAGACAAATTCCCCGAAGAACGTTGGGAAAACAAAACAAACAAGCTAATGACACCATCCTGCATACGGGCCTCAGTGTCATTACTAAAACAAACGGCTAGCCTGTGCAGCCTTACGTTTCAGCTTCTTTTTGTTTCTCCCTTTTGTTTTCAAAGGTTATGGTACGCAACTTCTAAACAACCGATGCAATTACCTTTTCGCTTGTTTTTCTCCTCGCCTTCTTTACGAGTAGAGATCAAGGACGAGCCCTGTCGCCCCTGTATCTGTGAAGGAAAGTATGTCCATTAAGGTCACTCCACTG
AATACAAACTGGAATGAGCATTCTGTTTTGTCACCAATTTTTCCTCTGTGTATTTCAGTACATTTGAGGCACAGAAATGAAATCGGAAGCCAGCTAAAAATTAATTATCGTTGCGAGCCTTAGCGAAATCACCAAGAATTGCGTGATGGCTGTGTATTCATTCAGTTTACACTTCGCCAAACTGAACATTGATTATTGAGGGACACACATCGCAAAGAACTCAATAGATTTACAGCATAAAATTATAAAAATGTTTATTATACATTCAAACGTTAAGGTGTAAAACTACCTAGTATCTACTCGCAACTGGAGGGGGGGGAGAGGTTGGTCGGTTTTATATTCGCTCGGAGATTAGGGTTCATTTTGTTTCACTGAAACAACCAAGATTCGATTAGAGCATTGAGTTGCATTTCCGTAGATATCACGCCACCCCCAAGAGAAATTTTCATGTTTGGCTTTAATACAAGTCACCAATTGACTATCTTATTTTAAAGAGAACTATTTTACCTTAGTGTGTTTAGAGGACAATCCAGGACTCCTTTGGACATGATTGTCTTGCTTGAAGGGAGACCCAAAGATTGACTGAGTTATCAGCAGAACGAAAAACATAAGGGTAAACAGCCTAACTCTACTAACTATGGCCTAACGATGCCCATTTTCAATTCTATAATAGGAGAATAAGAATGAAGGAAAGAACAAATGCAACATTGTTGTCCCGGTACAATATCATTTTAAGAGGGTTTGTTTGTTTGTTCGTCTGTAAATTGATGAGATGAACGATGTTATGACCTGGGAATGTCATAACTCATACGACAAGTTCAGGTAGCGCTCGTGGTAGAACTGATTGATGATAAATCACGCGACAGTAGCTGCATTTGAGACGAGAGACTGTGTCATAAATTTGCTGTCGTTCTTGTTTTTAACTTTAACTAATTAACTTAACGTAATTAATGTAATAAACTACCAATGTTGCCAATAGCTAGCTTTAAAAATGCTGGCACTGAGATGATATTAGAGAGATTTACTCTTACATGTTTTATTGCAAACGGCAAACGTGAATTTGTACCACGTGACCAAGTTTTACTCTTACTTGACTTTTACAAAATAAGAGTTGTTTGGGACAGTTTTTATCTGCTCATTTATTATTTTGAATCTACAACTGCCGTAAACGTGACTCTGAAGTTTCGCTACCCCTTTCGACGCCTGCTACGCAGGTTAAAGAAGTACAGTGGAACCCCGCCTTACGGCCACCTCGGTAACATGGTCACCTCGTTATTACGGCCACTTTTTTTTGGCCGCCCGGCTAAACGACCATACATTTTCTTGTAAAAAAACAAACCCTCGTTAATACGGTCACCCGTTAATACGGCCAAATTTTTTTGGCTCATTGGTGACCGTATTAACGGGGTTCTACTGAAAACAAACTTTTCTACTCCGTTCTCTGGAACGGAAAGGGAAACCGATCATCAAGCGAAATGTTATCTTCAATGACTATCCCAACGGGGAGCTTAAAATGATAGAAACTGACTCCTTTAGCAAACCACTGAAAGCTACTTGGATAAATAAGTATTTGGATGAGAAAAACAAGAGTAAATGGAAATTTTTTTTTTTGGCCTAGAATTGCTGAGAATTGGAATCTTTCGGTGGAGAAACTGTCCTCACAGGTAATCTTAACATGACGGACACAAAGACTTTGCTCAAAACAAAAGATAAATTCATCAATGAAGATCTGACAATTTGGGCAGAAGTAAACTTTGAAGTGCAAATCAAATCAGACAATCATTTTCTTAACCAAAGCTTCGCTTATGGCACAACTCGTTGATAAGAATTGACAACTGCCCTTTGTTTTTCCCTGAGTGGCATTGTAAAGGTATCACTAAGGTAAGACACTTGAGACAGGATGACTCAAATAATTTTTTAACTCTCCTTGAATTACAGACTAGTATTGGACTAAAAATCTGCCCTCTTAAATATCGTGGATTGTTGACTGCAAGCTCTCT
GGAACGAACATACAGGTAACTTTTCTGCGGACGGCTCTGAATGTCCAATTTTTTTGGAAAGGGTGTCAAAAACACAAAAAGCAAGTCAAATTGTGTACGCTAAACTTATATGAAGAACTGAAAAGTATACTCCCTACACACACTCAGCAGAAATGGATAAAGGAATGCAATATCGAAGGCAATAAACAATAAATGAGGTACGAAGCCCGATCAGCTCCAAAATGTCAGTGGCACCCAACGTCAATTTTCGGAAAATATCTGTTCGGAAGACGATTTGAGAATTTTCGGAACGTTTGTTGTAAAATTTACGGTAAAATTTCTTGCTTGCCTGCCTCTCCTAAGATTTTCGAACATCTAAAAAATGGTATAATTACCGATTTTTAACGGATTTTTACTCAAAAAGGTCTCCTAGAATTTTTGGGAGCATTTTTTCTGGCTGAAATTTTCGAAAAGGTAAGTTTTGATCTTATAATTTTCGGATCACTAGACTTTCAGCTAGGAAATCCGAACAGATGAAAAATTTTTAGGGGATAAAAATATGCCTATATCTACCTTTTAAATACTAAAATACGTTCAACAATGGTATTGTTGGGTGCCCCTGAAATGTACTAAAAGCACAAGTCTGGTAGAATTTCAATTCAGACTTCTACAAAGACGAATATTGACCAACAACTTTTTACCAAAATTGGAATAAACGATGATCCAAACTGCTCCTTTTGTAATGAAGAGCTAGAAAAACTAACACATCTTTTTTGGTCTTGTTTCAAAGTAACTACCTTTTGGAATTCCTTAATCCAACGACTTACTTTATCCCAAATTATCCCACAAAATAAATATTTCTACGAGCCTTGGATTGACACCAGACGCCCCAGTCTAAAAACCATAATAAATAAATTTATGTCTATTATTAACAAGAAATTATATTTGGATATTTTTGTAAAAATAAGAAAGCCTCGCCTAATATACAAGACTTTCTTCAATTCCTTAAATCAAACTATCCACTGGCGAGGAGCTAAACTGAACCTTGTGATGGCCCCAGTGAAAAGTGGGAGCTATTGCAGACTTTAATTTTGAAAGATCTTAGTTTGTTGATGTTGCTCTTGTTTTTGTTTTATTTATTTTTTCTCTTTGCTCTCGTCGGTCCCCTAGATAAATTCGGTTCTATCCTTCCTTCACTGCCTTCTTAAACAAGTTTAACTGCCTTGATAATTTATCTGCATTGCATAGTATCCCAGTATTGTTAGTGTTAAATTGTAAGCTATAATTAGCTAGTTATTGTAAGTGTCAACTTGTAAAGTTATTGTTACTCTCACAACTGGTAGTGTCAACTTATTTGGTCCTGTTTACAAGGAGGGAAGGTAACCCTGGTGCTAGGGTTGTCCTATCACTCTCATATTTCCTCTTTTTTCACACGACGTGTTTACAAGGCAGGTAGGGTTACCCTAGCGCTAGGGTAACCTTACCTGAGTACAGTATTATCCATTTTCTGAGCTTAATTATAGACATCTGAAAAATAAAAAGTTATGTTCTGTCTACCATGATAATATTTCCCTTTTTCCCCTGAATTTAAGGTGTTTTCCATAGAGAACTTCAAGATTATTTCAAATTTTGGACCTTTACAAAGAATGTCACGCATGACGAAACACGTCAGCAAAGCTTGTAAAGTTGACCCGGGGCCCGGTTCCTGAAAGACCGATTAGTGCCAATCCAGGATTAAAATTTTGCTCCACTTTTTGTATTTACCTTCCTATGCATTGTCTAGAGTAGCATTTTGTGTTATAATTTCTTTTTTTTTTCGAAGTAAAGGCACAGCTCGAGTGACTTGTTCTTAGACAAGAAAGCGTTGCTTAAAGTTTGGCTTTATCCTAGGTTAAACTTAACCATCTTTCGAGGAACCGGGCCCCGGGTGATAGGGTAACCCTACCAGTGAAATTTGCTTGTAAACTAGAGCTAATTTTAACCCCCTTGCTAGGGTAACCCTAGCAAAAGGGTAACCCTCT
TTCCTTGTAAACAGGCCCTTAGTTAGTTATTGTTAGTACAGTGTCAATTTGTAAACTAGTAGTTACCGTTTGCCAACTTGTAAGCACCGTTAAGGGTATTGTTTGCAATCTGCTGTTTGTTACTATAAAAAAAGTTTTGCAAATAATGTCAATAACTATTAAGTCTACGACAGATTTGTTCCTAACTTTACTAAATAATTCTCAGAGTTATCTAATTTTGTGTGCAAGTTTGAAGTCAATCGAGACTATACAACGCGCTGCACAAAATGCTGGTCAAATTTGTCCTAAAATGCCATGTAAACTTGTGAAAGCCGGTTTTCGTGCATATAATTGAAGATTACCGACAATCTACTAATCCTGCTATAGACTGCAAGCAGTCTCCCCAGGGGGGGGTGGGGGGTACCGCCATATATGGGCTATATAGGTATGTACCGTTGTGAAGGGTATGGTTTTCAAGCAGTTTGCTCTAGGATAGCGTATATAAATCAGAACATTTGGGTCTAGAATAGGAAAGGGTATATAAATCAGAACATTTGGGTCTAGAATAGGGTATCATTTTTCAGGAAACTGATCAGTTGGTTGAAGATTTTATCTAGACAAGGGAAACAGCTACTCTAGGATAGGGGGGATTTGGGGAGTTTACTCTAGTATAGGGTAGCAAAATCTAGCTGAACTAGCTCTGGTGTAGGTTAAGGGTTCCAGGGTCCCAGCGGCACATCCCCAACCAGAAATTCCTAAACTACCCCCAGGACAGTCTCTTATTTTTCTCTGCAAAGTGACCCAATGCACGCGAAACCGAAGCACGCGAGCGGCGAGCGAAGCCGCGATAAACGAAAGCGTAAGCCCGAGACGTCGTGGTTTGCAATCGCGCTGGCTGAGATAAGAACTAGACAGATTTAAGGGAAAGGGCATTTATTTAGTGAGTCGTTTGTTTCTGTCTTTTACCTTATTATTCCTGCCCCGTACCCAGACGTCTCTCTCGTGCGCGCAAAGGAAGGCGGGCCGTCTGTGCCCTTCCCATTGTCCCTTGCGGCTCATCACCAGTCGCTCGCCTCTACCTTGCGAAAAACGAAGCGGCTGAGGAGGTTTTGAGGAGGAGGCTGTTATTGTTTGTTGTTTTGAAGACAAGGCAAGTACTAGTGCCAGTCCCTCCCGAGAACTGAATTTTCAAGATGGTGTCCCCTTGTTATTTTCTAGTTTTATCCCATGGTTTAGCACTTTTTGCCGTTTCAAGATGTTACAGTGGCAGTATTTCGAACTTTTTGATATTTCCTCTACCTGTTGACCATGCTGCCAGCGAAATAGGGAAGAAAAATCATGAAGAGGACTTAAACAAACAAGAAGAGATTGGCCTATTTTCAAGAAAAAGCCAGTTGGTACCTATGCATACAAATTTGAGTTTTCTAATTTCTTCCTGTTCCAACAAGCCAAAATGATTATTAGTGTTTTGGTAACATATATGATAGTAGACTGCAATTTGGCTGTCAAAGTTTTATTATCCTTTTAGTTGTGCGCTACAGATCAATTAAAAAACGACCGATTTTTGGTTCGCTCGCAGACGCTGAGGTTAGGTCAGGTAAAATTAAAAAGTTTTAGTCTTGCCCGTCGCTACCTAACAATGTTTATGGTTTTCGAAGATAGTCCTGTGTTGTTTTGTATCGCATACAAAAAATGAACTTCGCTCGTTTAATTTTAAGGAGTTTAAGAGCTTCAGAAAATGGTCCAAATTTGCATAAAAAATGGCTTCAGAGGGGGCCAAAGTGATGCTCGTATCTCTAATATCTTCTCTAATACAATAGCAGTGAAGAACTTGTGTTTATTATATTTGTCTATTACTCCACCTAACCTATCTATGATGTTAGTATGAGCTTATTCTGTTGACCTCCTTCCGTTGACATTTTTAACATTTAAATTGCCCGGTTTTCTCTGACAACCACTCTTAAGCAATCAATAATTGATCCTGTAGTCAGAGCTCCACCATCACTGACACTATGGTCACC
ATTTTGCAATTATCCATTCTCCTTCAACAAAGCAGACTTAATGACCACTCAAATGGCGACTTAACAGTATCAAAAAGCGCTTCCAACCAGTTGTGAGTAAAATTTCTGGATAAAGCGAAACCAGACATAAGAATTCTGAATTTGTTTTTCATATTCAATAATTCACAGTTCAAAGCATTTTTCATGTTTGAGTCCAACACAAAAGAAGTTGAATTTAATAGACCTTGTCACGGTTTTCGACGCCATCTTGACGAGTAGGCAAACGTGTGAGAAATAACAGTGATTGTATGAGAATCTAATGTCGGATAATTTCCTTAGAATGGTTGTTCTGAAAAAAATATGATTGTGGATAACACGTGACCATTTTTCCTGATACCGTACTTTGGGCTAATATGGAACGAGGTCTTGAATCCCAAGAAATTCTGCACATTTTTACCACTGAAACTGTGACATTTCCCGGAGGAATCTGACCGTTACGTTTTACGTGTAAAGATGGAAACCAGGTTCCTTGCTGGCGTATTTATTTCTTTAACAGAAGAGACTGTGAAATACCGCCGTCAGGTTAAAATCGTTCAAACGGTCCAGCGCGAAACCGTGGCCACATAGCGACGGGTATGTCAAAAAGCTGAAGAAAGTCAGTTAAAAACGTTTTCTAGCTTGCGCGTCGAGCATTTTCGCTTTCCTTTTTTAAGGCAAAAGCTCATTTAGGCATTGCGATCAGCGAATCAGGCCGCGTTTTCTGAGAGTAGTCATGTAGTCAGAATAGAGTTCATATTAACTCTATCCCGTCCCATGGGAATAATGGACTTTCCTTAATAGAGATTAGAGCCATTATCCACATTACTAGTTTTTCCTTGTTTGCCATTATCCACCGGTATTGCGATGAATAATGAAGTATCTTTAATGGAGATTAGATATTATATTACCTGAGAGTAGTCATGTAGTCAGATTGGAGTTCATGTTACCTCTATCCACATTATTTCTATGAATAATGGACTTTCCTTAATGGAGTTTAGAGCCAATATCCACATTACTAGTTTTTCCTTATTTTCTATCATTAAAAAGTGATATCACATGTTGTAAGTTTCCTGCTGGTTCAACGTCTTTTAACTTTAAAAGTTAGCAAGATATAAAAGAACCTCATTACCTTTCTCATAAAGGAAAAGGTCAAGTCCAAAAATGGTTGTTTTGAAACCTTGTATAACTTGCCGATGATCTAACCGTCCTCGAAATGACGGAAAGTCAAATAAAAAAATATTTAACAATTATTCAATGAGTGCGCGTTGGATATGATGAGGTTGTAAATAGCCAACGAGGCGCGTAGCGCCGAGTTGGCTATAACTGCCTCATATCCAACAAGCGGGAATGAATTAATAGTTTTATTAAACTTTTTTAAACTCCAAACGTCTGGTTATTACAACTGAATTTTAGTAAATTACTAAACGACCGGATATTAATGTAACTTCCGGTAAACCAAGGGAAAATCACATGACATGTGCGCCGCTTGCGAACTTAGTTGAATGAAAAATCTGAAAGGGACGGACGCTGTGCTTTAATTATTCTCGACAAGTACTGTTTTCGCTTTACCTCGCTGTGACTGTGGAATTTGTCGCTGGATAGGTCGACCTGATTGCTCTTCAATTTCCTTTTCTTCCTCCACTGAATCCGAGCTGTAGTGTGTAGTTCATGGTTGTTGCTGTTAAAAGGAAAGCCGCTCGTTGTAGGATTCACTTGAGGCCCGTTATGGACATGCGAAAGTGTTAAGAGATGTAAAAGCTCTTCAAACAAGCAAAAAATCTGCCAGTGGTTTTGAAAACTTACTCCTACAAGGACGTGCTAGCCATTCTGAAAATTTTTAAGGTAAACTTTGTTGTTGTTGTTGTTGTTGTTGTTAAATAATCAGCTCGACGTAAGGAAGAATTAAGCTTATAATTTCTTCGGAGTACCATTAAATAATTTCAAACCAAAGTTTGTAGCTTTTTTGTCGCTGCCGCTATGGC
ATTTTCTGTTTTTTTTTCTTTTATTTCGTCGGTAGATCTACTTTTTTCCGCCGCTTTGTCCGAGGGAAATTTTGTTGTTTTCCATATTAGGTCAAATCAAGGTACATGAGCAAACATAAAATGATAACCGAGATTGAGTGAACCAATCAGAATGCTAGATATGCATTATCATATTAATGTGTACCTGTGTGGGTGAGCGTGTAAACCCCCATCGCAACTGTTTTCCAGGGAATAAAATGACGGCTTAAAGTTATAAATGATTATGTTCCGCGATGAAAAACACTCGACTTACCAGAGTTAAGACTAATTTCACCAGCGCGCATTGGATCTCTTAGTGATCGTTTTCACTGAAACGCTCCTCACCCCTGGCCTGCGCATACACTTTTCACTATGAATAACTTTCATGCTAAAATTTTGCTAAACAAGCACCGTAGATTTAAGAACGTCTCTTTCTTTTGCTAAGAACGATCATAGCCTGTTGTCGGGAAAACCGTTGTTGAAATACGTCGGTTTTCCGCGACATTGTTTGGGACAAACCATATATGTATCACAAGGGCCATGTTGCTCGCGTGACTTTCAAGAGCTCAATCATAGGAGAAAACAAACAACACTTAAAAAGAGACTCAACAAGAATTACTTGTAAACATAAAAAATCAAAAATCATGATCTTAAAGCATTCACTGAACATCCCACTGGTTTTGTCTAGGACTGAGTGAGAAATATGCAAACCAACAAAGCATTGTTGTTCACTAGGGGGAAAATGACCTAATAAACGCCCGGGGCGTTTATTTAATTTTAGGGATCCAAGCGGAGGCGGTTAATAGATAGGAGGGCTTTTTTATTAAAAGAGAGAGAGAGGGAGACGTTTATTCTCATAACTGTAACAAGCTCAAGAGAACAAATATGCTTTCGGCAAAAATATCAAGAGAGTTTTAAAGTGGTGGAATATCCTGACTCCATCAATTTATACGTCTTGGCAGTCTAGAGATCCATATCACTCTTACAAATCGTAACATCGATGTCAGAATGCTCGCTCAGGGTCATGTGTTGCCATCATTAGTCCACTCTAAAGTTGACATTGAACAAGATGAAATACGCAAAGCCTTGTTAATCAAAGGAGGAAACTAAATGAATTAAAAGATTTTGCTGCTTTTTGCTAATTCCTAGTATTTTGGAGTAATTCTCATAGGGGGCGTCTATTGAAGAGGGGCGTCTAATAAAATTTTCACAGCGTAGAGGGGGCGTTTATTAGATAAGAGGCGTTTATTTGGAAGTGGGAGTTTATTAGGTTTACGGTAGTCTTATTTCATCACTAATTTATAGTTGCACAAATCTGGTTGTACCCTGTTTTGCGTGCGCAAATTTTACCTTAACCCTTAAAATATATTCAGTTGTTCTTGCATCAACGCGAAACCTTTACCGAGTGTAAATATTTTTGATTTTCTTCGTTTTAACCATTCTTTTATTTAAAGTGAATTATCTCCGAAAAGAACTTTTAAAGTGTCTCACCATGTCAAGAAATCCAAGACAGTCTTGAATTCTGGATTCTACGCCGCGGATTCCGGATTCCAGGTACTGGATTCCAGTCTCTTACAGTGGAACTTGGATCGGCTGGATTCCTTGAGTTGTATTCCGGATTCCAAAGCACAGGATTTCGGATTCCGCAAGCAAAACAATTCCCGGATTCTGGAATCCGGATTCCCTTATAAGGGACGAAAAGTGTGAGAGTCTCGTTCCCAGAGGCCGCGATCCTTTTGTACAGCGACGAGGATGGTAGATACTTCATATATGATTCCTATAGTGTACCGATTTGCCTCATCCGCTGGACAAAGGTAACGGAGGCTCTGGGGACGAGACTGAAAGTGTGAGAGTTTCAGATATGCTTAAGCGCAAGATTAGGTGTCATCGGTGGCCTCACGCCAGCACCCCGATAACCTGTATAGATCCTGAATCCAAAAATTTATCAAAGTCAAAACTACTATTTAACACAAAATTACCCA
TCCCACACAACTAGCACTTTTCAATTCCGATACGGCCTGATGGTTTACACGGAGCTCAAGGTTGATGGTTTTCTGATCCACATGAATTACCTGAGGATGCAATTCGTTAACTAAAACGCGTTTTTACATCCCTCAATAAATAATGTATACTTCTTAATGGAAATTGGAGTTCATATTACGTGTACCGGCAACAAATGGTTTTGTGATGACTGTATCAAATAATTAGATAATTGTCCGGTAGCCGATTAAGAGGTCATATTACCACTGTCCACATCAAAGAGTTCATCAACCCTCGAGAGGTTATTGTTCACTTTGTATTGACGCCAGAAAGAGGGGATACTCAAACACGCGTAAAAGGTCAGTAGTATCTTATTAAGCTGTTATGAAAGCGGAATATACAGATGTACCAACGTTTAATTCAGCCCGGAAAGTCATCACTGAAAATTCGAATTTTACAAAAAAAATTGCTGGATTCATTCCTTAATGGAATGGAGCCTTGAGGTTCGGTTTATATTCGTATTCGGTCAGGGCCACCCAACGATTGTTTTCTGTAAAATATATGTGTGAGGAAGCACATTTTGTCTGGAAATTTTTATAGCTTGAGAAAGGCTTAAGATGATTACCCTATTTCCCTTTCGTATTTGAAAAACACCAGAGTAACCAGAGTTAAGAGTAATTTTACCAGCGCGCATTGGATCTCTCATTGATCGTTTTCACTGAAACGCCTCTCATCCCTGGCCTGCCCATACAGTTTTCAGGCGAAAATTTGCCTAAGCAGTAAGCACCGTAGATATTAGAACGTCTCTTTCTTTTGCTAAGAACTAACATAGCCTGTTGTCGGAAAAAAATGTTGCTGAAATACGTGCTGCTTTTACTTTCCCTGTCTCCTTTAATACTGAACTTTCTGGTTTGCCCTGCATACGCGGCATTGTTGGACAAACCAGCTTGCAATCACTGGGGCCATGTTGCTCGCGTGACTTTCAAGAGTTCAATCATAGGAGAAAACAAGCAGCACTTAAAAAGATATCCAGAGGAATTGCTCAGTTCCGGACATGAAAAATTAAAATCTTAAAGCATTCACTGAACATCCCACCGGTTTTGTTTAGGATTGAGCAATGTGCAAATCAACAAAGCCTTGTTGTTAACAAGGGGGGAAATGATCTAATAAAGGTCCAGGTTAGGGGTCCAAGTGGGGGCGTGTTATAGATAGGAGGCGTTTAAAACAGAGAGGCGTTTATTCTCATAACTGTAACAAGCTCAACAAAGCAAATATGCTTTCGGCGAAAGTATCAAGAGAGTTTAAAATTGGCGGAATTCGAATATCCGCTCCATCAATTGATACGTCTAGGCCGTCTAGAAATCTATATCATTGCACTCTTACATATCCTAACATCGATATCAGACTGCTCGCTTAGGGTGATTGTGTTGCCATCATTAGTCTATCCTCAGTGACTCTAAACTTGACACTGAACAATGCAAGATGAAATAGGCAAATCCTTGTTTGTTAATTGAGGGAGAAAACTAAATGAATGAAAAGATTTTGCAATCCCTACTGAGCCTTTTGGAGTAATTCTCATAAAGGGCGTCAATTAGGGAGGGGCTTCTCATATTTTCACAGCGGAGGGGAGGCGTTTATTTATTTATTTTTGTTTACAGATTAGGTCGATTATCATGTAAGGTTTTTATCGTTGGCCTCCGGCTATCTCTCTCACTGTCAATGGTGATCGCGAGCACCCCGATAACCTATAAAGATGAAATCTGAAACATAACCATCGAGTAAATTCAAACAGGCGTCTCGATTTCTAAATCAAACCCGAAACTATTATATAACACAAAATTACCCATCACTATTCCAATACGGTCTGTTGGTTAACACAGAACTCAAGGTTGATGGTTTTCTGATCCACATGAATTCCCTGTGGATGCAATTCATTAACTAAAAAGCGTTTTTTCATGCCTCAATAAATAATGTATACTTCTTAATGGAAATTGGAGTTCAT
ATTACGTGTACCTGCCACAAATGGTTTTGTGATGACTATATCAAATAATTAGATAATTACCTCAATCAGGAGCTGACCTCAATATAGCCGATTACGAAGTCATATTACCAGTGTCCACATTTAAGATTTCATCAACCCCCTAGAGATTATTGTTCACTTTGTATTGAAGCCAGAAAGCGGGAGGGGATACTCAAACACGCTTAAAGGTTAGTAGTATCTTACTGGTAATAGTTGGCACTACAGCTGCCCCCGCTCTGTATATTGTCGGTTAATAGTATTCTTGCTCGTTATGTAGCTGTTTGAAATAAGCCGATTCATAATGAAGATGTTCACACATAGTCCATACAACAGGTGAGATAAATCTGCGGGGGCAATTAAGCTGTTATGAGGGTGGAATATACAAATGTGCCAACGTTTTATTCAGTCCAGAAAGTCATCACTGGAAATTCGAATTTGGCCACCCGAATAAAATGTTTCGTTTCCCTTCGCCCCGTCTCTTGTGATGGTCGGTCGGTCGGGCAAAAGAAAAGAAAAAAGAATGAACACTTGAAGGGGTCTGCGTTCCAGAGAAGGCCGGGCTGCTAAGTAAGAAAGCCTGGCTACAAAGTCAACTCGAAAACGTGGTCTTTTCGACTATTAAATGTCTAAATGGCCTTTAAAAAACGTAGTATCGATGTTAATTAAAGACAAGGACATCACATCATACCAATTGTTGTTAATAAAACAAGATCGTGTGCTTGTAAATTAAAGTACCCTTTTGAAACTAGAGAAGTCCGAAAATCATTTGGTTTACTTTCGTTCCGTGGTGTGATAACTAATTAAAATTAACCACTGTAATATAACAAAAGCTCGTGATGTGTGTGCATTAGCTATTTGCCTATGGGTAAACGCTGGCTAAAGCTGGCGAGTCCACTTATTGTATCTTCTGAGGCCGTTTCTGAGGAAAAGTCCTTAAACTGTAGCTTAACATAACGATGCCTATATATGGCGAAATTTCGACAGTGGTCCCTGATTGTATGGTTGGTTCTCATTGTGACGTCTTTAGGATACGCCCATATATGACCCAATACAGAGATTCGTGGCAAATGATTCTTAATCACCCAAACTCCATACAGACACCGTCAACTTCACCACCATTTCACAGTTTTCAGTGGTAAAAGTAACTCCTGAATTTAAGCCGGAAGCAATGAGACCAGTTGTTGATTATCTGAAACAAATATTTTCTTTTTTCAGTGCATGACGTTTATTTAGTCAATTCTCCTCTTTATAAAGCAGAGGTGCCCAAAGACGGTTTCCTCCTAAATACATTGAGAACACTCTTTACGTGTCCAGATTACTATTAAGATCTATAGAACCAAGAGAGGATGAAGGGGCCTTTATCCTCTCTTGATAGAACTGAATGCATTCTAAGCGACGCTATGACGTTTGTACCTGTGCGGTCATCCTAGGGGAACTTGAGTCTTGTCGAAATTTCAAGGGCTTCAGTATACATTTTTTGATTCCTCCCTGACTCACCTTCACAGTTTTGAATGCGAAAATTTCAGGTTTCCTTTTTTGAACGAAAAAAAAAAGGCCTAACCTGGGCAGTAAAAAAAACAACAAAATTGTCATTGAAAATCGTAGGGAATTTATTAGAGGCTTCGAAAATTGTACATAAATGGAACGGTCATCTAGATTTTTTTTTTATAAGCGGTTTTCCGAACAAAAATGGTAAACAACAGATGCCGTACATGTACGACATTCTAACTGAATCTATTTTGGTGGCTCCTTAACTTTTAGCCATTAGTTACATATCATTCCATGTTTGTCATCTATTTACATTCGTATACTGTTTTTAAAACCCTTACAGTCTGCTTACGCCGGTTTACTTTTTCCTAATAATTTTGAAAACAAAATTCATTATCAGACTAAGGTTTTACTATAAGGTTTGTTTATAGTGGAAAGTGCACTTAGCTTATCCAGCTAGTTTACAGGCACTCCACTCAAATACTTAGAAACAA
ATAATTTAAACATAACATAACAGGATTAAAAACCCCACTGGCAGGAGGCAACTAGTTGGCTGTTTACAAGTGTGGCCAAGGATTTCAAATCCAGCAAGTGGTCAGAGCGGGACTCGAACCCAGGACCGCGGGATTGCGAATTTACGCGCTGACCACTCTTCGACATCGGCTTCAGTTTTAAATTCTTATAATTTTACTATCTTAAGATATATTCTATTCGAGAAACGTATGTGTTTCTAACCGAAATATATACATTTCGTATATTCATACGTAGTTTATGTTGCTTTTCCCTCTTGTCTTCACCGCGAGGGATCAATTTGTTGTCTTTTTCCTTTATTTTTCTCAACACGAGAAGCACGAGGACTTGCCGTTGTCTTGGTAAATGATCCTTTTCGGATCTTCCGTTTTTTTGGTAGTGAAGAATCCATATGATCTGAGATCGCAAATCCGTTTTTAGATTCTCCCAAATAGACGCAGAGTCATGTTCAGTGCCGGATCCAGGCCTTGAGATAAGGGTCAGTTCAGAAAAAAATTTATTCGGCCCTTCGAGGCCTCAGCTTGGTCTAAAAATAAGGTGGGGGTGGGGGGGGGGGGGGGGCTGTGGGTAAAAGGTAGGATCCGGTTGTTTTTTTAAAAAAAAAATTGAAAGAAAAAAATAGTCGCCCACTCTTTTGAACTCCAACCCGGTCTTNTAGAATTTTAAAATGAGCTTGGTCAGGGCGGTCATCCTTCTGTTCTCCATTGCGCTGGTAACTCAGTCAATCTTTGGGTCTCCCTTCAAACAAAACAATCATGTCAAAAGGAACCCTGGATCGTCTACTCATGACACTGAGGTAAAATATATTGTCTTTGAAATGAGATACGTACGGTGGGTGTTCAAGTCAATTTTGGTGACTTTCGTTTAAATCGAACATGAAAGTTTCTTTTGGAGGTAATACAGTATGTTGCCCAGTATCCGGACACTTCAGTTTAACATTGCAATAACTTTCCTTTGTTAGTCGCAAGAGCTCTGAAATTTGGCCCAGAGAAAATCTACTTAGTCCTTAATATGACGAAAGACAGTTTAACTTTTTTATCTTTGTTTCCATCGCGAAATTAATATTTTTGCAGAGCTCCTATGTTGCCCAGTATCCGGACAGCTGGCCCAGTGTCCGGACACAACAATAACAACGTAATTGTACATAAATTTCGACAGGCAAGCGACTTATAAGCTTCTCTTGCACTTGCAAAGTAGTCTGGCAATCGATTCCAAAAATATAACCTAGCATCGTGAAATAAAACATAACAAATGACCGGGGTATGGTGGGGAACGCGAGCGGTTGCTAAATGGCATAATTCTCACTTCCTTGTCTAGGAACTTTTTTCCCTGATCATGTAAGGAAGGCATCCATGGACATGCCGAAGCCGCAGTTTCCTAGCTCAATTAGCCAATCTGCAAAAGACTTTTTTTTCCTTCATTTTTTATTTTAAAAAAAAGCTTGCGGAAGGGACCTCAATGCGACAGTCAAAGGTCACTCTCCTATTTATAGTCTGACCTGCGGTGTTGATTTCTTCATGCTCAGTCCCGACAGACCTTACAATAAGCCTGTTTTTCTAGCACTAATTCCTCCTTCTTTCACATCTCTCAATCCCTTTGTGACATTATTTAGACCGTTGCAACCCCATTCTAGCAGTCACAACTGGGGAAAGTATGAGAATTCCAGCTTCAACTCCGACGCTTTTGATTTTTATATGGACAATGCAGTCACGCGAAACAAGCGGCGCACACGAAATCAAGTCGCCTCCGAATGAGTGAAGAAAATTTCCGTACGGAATTGAGTAGAATAACATTTTACGACTACATCATATATCCTAAGCTTTAATTATAGTTATAGATATGAAATAAATCTTATCCGGGTATAGGTATTCATCGATTCTGTACAGCAAAGAAAAAACTGTCCGGATACTGGGCAACCTAACATCAATACTTAGTGAATGTTTTTGGGCTCCGTTATTCCAAA
CAAATCGAATTTCAAGAAGAATCAATAAAATTTCTGAGAACCTGACTTCTTTAAGTATCTTCACTTTAAATATTAAACCGTTTGTTTGAAAATATCACGTATTACCTACCCTTTTCTGAGCAGCACTAATTTTACCGCGCAGTAAACAGCGTAAATATTAGCCATAAAATGTTTATTCACCTGAACAGAACGGCGTCTACTGCGACTGTTTCGCAAGCTTTCAACTCGCCAAAACCATCATCTTAAAGCTGAATTGTTCAGCTTTCATTCGAAAAACTTCGTTTACCGATAACTGAAAAGGGCGCCTCGAAAATTGAGTTTTTGTTTTAAGAGCCAGTCTCTGTAAGATCCTCATATCGAGTTTTGCCCACAAAATGGATTTCGCTCATAATTTTTCTGTAGACTTCTTTTAATAAGTATATAACAAATATGAAACTAGATTGGAGAAATTCGCTTCTGTAAATTTTTTTTAACGAATTTTCTTTCGGCGCCACATGAGGACCTGAGATGCCTGTGAAATATGCAAATTTTGTCAAAATTCAACCGATTGCTCCGGATCATAGTCCCCTCTACTAACTATGTCCGGATAAAAGAGTTCGACCCAAAGCTTCCAATTTATTAGTTATTTTAATTGAGCCTTTATCTTTACAATAATACAGGTCAGAATCAGCAACACCTTTTCGTTTAGAAAATCTGAGGAAACACACTTAGCCCCTTTGACCCACTTAGCGAAACATACGATTTTAGCCAAATTCAGTGGATTAAATCTCAAAAGTGGCTCACACTTACAGACTCTCCTGCATATCATTGTGTAGTTCAATCCTTCAGCTACTGAATCGTGTTAAAAGTTTTGGCGGCCATTCAGGTTCGGTCGAGGGGTGAGTGGCGAAACTTGCCTTACTTTGCCATTTCGCCGGTGTTTCGCCATCGTGAAGTCCAGAAGGATTGTCAGAATAGAAAGCGAGAAATCGGGTAAATGCCACTCGAAACTTGTCCATTCCTAAAGACTGCAT
gene-Peve_00000001 1382 20 1382 1383
gene-Peve_00000039 2535 1423 2535 2536
gene-Peve_00000112 271 3979 271 272
gene-Peve_00000117 11913 4271 11913 11914
gene-Peve_00000122 35013 16205 35013 35014
gene-Peve_00000008 2351 51239 2351 2352
gene-Peve_00000013 21088 53611 21088 21089
gene-Peve_00000032 7683 74720 7683 7684
gene-Peve_00000035 39241 82424 39241 39242
gene-Peve_00000042 5389 121686 5389 5390
4 Subset Expressed Genes
Only those with at least one read in each sample
4.0.1 Peek at count matrix
# Preview the first 10 lines of the count matrix as an aligned table,
# then print two blank lines followed by the file's total line count.
head -n 10 "${count_matrix}" | column -s ',' -t
printf '\n\n'
wc -l "${count_matrix}"
gene_id RNA-POR-71 RNA-POR-73 RNA-POR-76 RNA-POR-79 RNA-POR-82
gene-Peve_00006864 0 0 3 263 79
gene-Peve_00022042 439 396 552 640 286
gene-Peve_00009109 89 181 33 293 1789
gene-Peve_00009108 0 0 0 0 0
gene-Peve_00004359 4598 2449 6571 2745 10812
gene-Peve_00004358 147 159 24 155 153
gene-Peve_00004353 158 62 461 123 143
gene-Peve_00009100 179507 81177 154976 83235 211948
gene-Peve_00009103 187 16 885 106 263
40390 ../output/06.2-Peve-Hisat/gene_count_matrix.csv
4.1 Import count matrix
# Read the gene count matrix CSV into a data frame.
# read.csv() uses check.names = TRUE by default, so sample IDs such as
# "RNA-POR-71" appear as "RNA.POR.71" in the column names.
count_matrix_df <- read.csv(count_matrix, header = TRUE)

# Inspect dimensions and column types
str(count_matrix_df)
'data.frame': 40389 obs. of 6 variables:
$ gene_id : chr "gene-Peve_00006864" "gene-Peve_00022042" "gene-Peve_00009109" "gene-Peve_00009108" ...
$ RNA.POR.71: int 0 439 89 0 4598 147 158 179507 187 0 ...
$ RNA.POR.73: int 0 396 181 0 2449 159 62 81177 16 17 ...
$ RNA.POR.76: int 3 552 33 0 6571 24 461 154976 885 0 ...
$ RNA.POR.79: int 263 640 293 0 2745 155 123 83235 106 0 ...
$ RNA.POR.82: int 79 286 1789 0 10812 153 143 211948 263 21 ...
4.2 Only genes with at least one read per sample
# Keep only genes with at least one read in every sample.
# The test is restricted to the numeric count columns so that the character
# gene_id column is not compared against 0 as well.
sample_cols <- vapply(count_matrix_df, is.numeric, logical(1))
expressed_in_all <- rowSums(
  count_matrix_df[, sample_cols, drop = FALSE] > 0
) == sum(sample_cols)

# which() drops rows whose test evaluates to NA (a missing count), so such
# genes are excluded instead of being returned as all-NA rows.
filtered_count_matrix_df <- count_matrix_df[which(expressed_in_all), ]

str(filtered_count_matrix_df)
'data.frame': 17224 obs. of 6 variables:
$ gene_id : chr "gene-Peve_00022042" "gene-Peve_00009109" "gene-Peve_00004359" "gene-Peve_00004358" ...
$ RNA.POR.71: int 439 89 4598 147 158 179507 187 1049 484 128 ...
$ RNA.POR.73: int 396 181 2449 159 62 81177 16 412 15 60 ...
$ RNA.POR.76: int 552 33 6571 24 461 154976 885 1785 460 420 ...
$ RNA.POR.79: int 640 293 2745 155 123 83235 106 433 78 118 ...
$ RNA.POR.82: int 286 1789 10812 153 143 211948 263 1613 151 127 ...
4.3 Subset genes FastA
Only expressed genes
# Gene IDs of the expressed genes (gene_id column of the filtered matrix)
filtered_gene_ids <- filtered_count_matrix_df$gene_id

# Load all gene sequences
fasta <- readDNAStringSet(genes_fasta)

# Keep only the sequences whose FastA name is an expressed gene ID
subset_fasta <- fasta[names(fasta) %in% filtered_gene_ids]

# %in% silently skips IDs that have no FastA record, so report any that
# could not be matched.
missing_gene_ids <- setdiff(filtered_gene_ids, names(fasta))
if (length(missing_gene_ids) > 0) {
  warning(
    length(missing_gene_ids),
    " expressed gene ID(s) have no sequence in the genes FastA",
    call. = FALSE
  )
}

# Write the expressed-gene sequences to the subset FastA
writeXStringSet(subset_fasta, genes_subset_fasta)
4.3.1 Peek at genes subset FastA
# Show the first 10 lines of the expressed-genes FastA, a blank line,
# and then the number of header (">") lines, i.e. the sequence count.
head -n 10 "${genes_subset_fasta}"
printf '\n'
grep --count '^>' "${genes_subset_fasta}"
>gene-Peve_00000013
ATGAGCTTGGTCAGGGCGGTCATCCTTCTGTTCTCCATTGCGCTGGTAACTCAGTCAATCTTTGGGTCTCCCTTCAAACA
AAACAATCATGTCAAAAGGAACCCTGGATCGTCTACTCATGACACTGAGGTAAAATATATTGTCTTTGAAATGAGATACG
TACGGTGGGTGTTCAAGTCAATTTTGGTGACTTTCGTTTAAATCGAACATGAAAGTTTCTTTTGGAGGTAATACAGTATG
TTGCCCAGTATCCGGACACTTCAGTTTAACATTGCAATAACTTTCCTTTGTTAGTCGCAAGAGCTCTGAAATTTGGCCCA
GAGAAAATCTACTTAGTCCTTAATATGACGAAAGACAGTTTAACTTTTTTATCTTTGTTTCCATCGCGAAATTAATATTT
TTGCAGAGCTCCTATGTTGCCCAGTATCCGGACAGCTGGCCCAGTGTCCGGACACAACAATAACAACGTAATTGTACATA
AATTTCGACAGGCAAGCGACTTATAAGCTTCTCTTGCACTTGCAAAGTAGTCTGGCAATCGATTCCAAAAATATAACCTA
GCATCGTGAAATAAAACATAACAAATGACCGGGGTATGGTGGGGAACGCGAGCGGTTGCTAAATGGCATAATTCTCACTT
CCTTGTCTAGGAACTTTTTTCCCTGATCATGTAAGGAAGGCATCCATGGACATGCCGAAGCCGCAGTTTCCTAGCTCAAT
17224
5 BLASTx
5.1 Download SwissProt
# Download the current SwissProt FastA into the BLAST database directory,
# stored under a date-stamped name, and keep both the archive and its
# uncompressed copy.

# Stop at the first failing command so a failed cd or download is not
# followed by work on the wrong directory or on a stale/partial file.
set -e

cd "${blastdbs_dir}"

# --fail: exit non-zero on an HTTP error instead of saving the error page.
# --output: write directly to the date-stamped file name.
curl --fail \
--output 20250618-uniprot_sprot.fasta.gz \
https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz

# --keep retains the .gz alongside the decompressed FastA
gunzip --keep 20250618-uniprot_sprot.fasta.gz
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0 0 88.7M 0 374k 0 0 547k 0 0:02:46 --:--:-- 0:02:46 546k 27 88.7M 27 24.7M 0 0 14.5M 0 0:00:06 0:00:01 0:00:05 14.5M 65 88.7M 65 58.1M 0 0 21.7M 0 0:00:04 0:00:02 0:00:02 21.7M 94 88.7M 94 83.6M 0 0 22.6M 0 0:00:03 0:00:03 --:--:-- 22.6M100 88.7M 100 88.7M 0 0 22.7M 0 0:00:03 0:00:03 --:--:-- 22.7M
5.2 Create BLASTdb
# Build a DIAMOND database from the downloaded SwissProt FastA.
# Abort if the database directory cannot be entered, so the database is
# never written somewhere unexpected.
cd "${blastdbs_dir}" || exit 1

"${diamond}" makedb \
--in 20250618-uniprot_sprot.fasta.gz \
--db "${diamond_db}" \
--quiet \
--threads "${threads}"
5.3 Run DIAMOND BLASTx
# Run DIAMOND BLASTx of the expressed-gene sequences against the SwissProt
# DIAMOND database, writing tabular (outfmt 6) results with at most one
# target per query and an e-value cutoff of 1e-10.
# stderr is redirected to a log file in the output directory.
"${diamond}" blastx \
--db "${blastdbs_dir}"/"${diamond_db}.dmnd" \
--query "${genes_subset_fasta}" \
--out "${diamond_output}" \
--outfmt 6 \
--sensitive \
--evalue 1e-10 \
--max-target-seqs 1 \
--block-size 15.0 \
--index-chunks 4 \
--threads "${threads}" \
2> "${output_dir}"/diamond-blastx.log
# Show the first 10 BLASTx hits, a blank line, and the total hit count.
head -n 10 "${diamond_output}"
printf '\n'
wc -l "${diamond_output}"
gene-Peve_00000013 sp|P58691|ACTPA_RADCR 42.0 112 57 4 20741 21073 69 173 1.43e-15 81.6
gene-Peve_00000035 sp|Q9NBX4|RTXE_DROME 24.9 486 325 16 10287 11693 428 890 2.49e-17 95.5
gene-Peve_00000042 sp|P62993|GRB2_HUMAN 51.8 56 27 0 887 720 161 216 3.81e-11 68.2
gene-Peve_00000058 sp|A3KN95|T151B_XENTR 29.2 202 135 5 1732 2322 127 325 1.16e-12 75.5
gene-Peve_00000107 sp|F4IE66|PRP22_ARATH 64.5 76 27 0 10104 9877 41 116 1.08e-20 103
gene-Peve_00000108 sp|Q6P4S8|INT1_MOUSE 51.7 89 42 1 5664 5930 180 267 3.17e-20 105
gene-Peve_00000114 sp|F1R8Z9|FXJ1B_DANRE 73.1 67 14 1 2663 2475 183 249 5.64e-21 102
gene-Peve_00000115 sp|Q9DCV4|RMD1_MOUSE 33.3 147 85 4 8274 7870 71 216 3.40e-13 76.6
gene-Peve_00000116 sp|Q32KL4|RMD1_BOVIN 47.6 84 42 1 8369 8124 85 168 2.34e-12 74.3
gene-Peve_00000121 sp|O50655|XERD_SELRU 25.5 314 209 10 25765 26682 29 325 2.56e-13 79.3
8966 ../output/30.00-Peve-transcriptome-GOslims/Pevermanni-expressed-genes.blastx.outfmt6
6 GENE ONTOLOGY
6.1 Get gene IDs and SwissProt IDs
# Build a two-column, tab-separated gene ID -> SwissProt accession file.
# Splitting on both tab and "|" turns "gene<TAB>sp|ACCESSION|NAME ..." into
# fields gene, sp, ACCESSION, ..., so the gene ID is $1 and the accession $3.
awk -F'[|\t]' -v OFS='\t' '{ print $1, $3 }' "${diamond_output}" \
> "${output_dir}"/gene-SPIDs.txt

# Preview the mapping, then two blank lines and the line count
head -n 10 "${output_dir}"/gene-SPIDs.txt
printf '\n\n'
wc -l "${output_dir}"/gene-SPIDs.txt
gene-Peve_00000013 P58691
gene-Peve_00000035 Q9NBX4
gene-Peve_00000042 P62993
gene-Peve_00000058 A3KN95
gene-Peve_00000107 F4IE66
gene-Peve_00000108 Q6P4S8
gene-Peve_00000114 F1R8Z9
gene-Peve_00000115 Q9DCV4
gene-Peve_00000116 Q32KL4
gene-Peve_00000121 O50655
8966 ../output/30.00-Peve-transcriptome-GOslims/gene-SPIDs.txt
6.2 Get SwissProt IDs
# Collect the unique SwissProt accessions (second "|"-delimited field of
# each BLASTx hit line) into a sorted list.
awk -F'|' '{ print $2 }' "${diamond_output}" | sort -u > "${output_dir}"/SPIDs.txt

# Preview the list, then two blank lines and the number of unique accessions
head -n 10 "${output_dir}"/SPIDs.txt
printf '\n\n'
wc -l "${output_dir}"/SPIDs.txt
A0A096X8J7
A0A0C2SRU0
A0A0G2JZ79
A0A0G2L7I0
A0A0P0XB70
A0A0R3K2G2
A0A0R4I9Y1
A0A0R4IBK5
A0A0R4IES7
A0A0R4ITC5
6137 ../output/30.00-Peve-transcriptome-GOslims/SPIDs.txt
6.3 Retrieve UniProt records
A difference in the number of records could be due to retrieval of only “reviewed” records, whereas BLAST may have included both “reviewed” and “unreviewed” SwissProt records.
# Run the shared UniProt retrieval script with the SwissProt ID list and the
# output directory as its two arguments.
python3 ../../M-multi-species/code/uniprot-retrieval.py \
"${output_dir}"/SPIDs.txt \
"${output_dir}"/

# Decompress the retrieved table (expected at uniprot-retrieval.tsv.gz in
# the output directory)
gunzip "${output_dir}"/uniprot-retrieval.tsv.gz

# Two blank lines, then the number of lines in the retrieved table
printf '\n\n'
wc -l "${output_dir}"/uniprot-retrieval.tsv
500 / 500
500 / 500
500 / 500
500 / 500
500 / 500
500 / 500
500 / 500
500 / 500
500 / 500
500 / 500
500 / 500
500 / 500
137 / 137
6149 ../output/30.00-Peve-transcriptome-GOslims/uniprot-retrieval.tsv
6.4 Map GO to Genes
# Read gene-SPIDs.txt (no header): gene ID and SwissProt accession
gene_spids <- read.delim(
  file.path(output_dir, "gene-SPIDs.txt"),
  header = FALSE,
  stringsAsFactors = FALSE
)
colnames(gene_spids) <- c("GeneID", "Entry")

# Read uniprot-retrieval.tsv.
# check.names = FALSE keeps column names containing spaces, such as
# "Gene Ontology IDs", exactly as written in the file.
uniprot <- read.delim(
  file.path(output_dir, "uniprot-retrieval.tsv"),
  header = TRUE,
  stringsAsFactors = FALSE,
  check.names = FALSE
)

# Accession -> GO IDs lookup. unique() removes exact duplicate rows so a
# repeated record cannot duplicate gene rows in the merge.
entry_go_lookup <- unique(uniprot[, c("Entry", "Gene Ontology IDs")])

# Merge on Entry; all.x = TRUE keeps every gene, with NA GO IDs where no
# UniProt record was retrieved.
gene_SPID_GOID_merged <- merge(
  gene_spids,
  entry_go_lookup,
  by = "Entry",
  all.x = TRUE
)

# View result
str(gene_SPID_GOID_merged)
'data.frame': 8966 obs. of 3 variables:
$ Entry : chr "A0A096X8J7" "A0A096X8J7" "A0A0C2SRU0" "A0A0C2SRU0" ...
$ GeneID : chr "gene-Peve_00003282" "gene-Peve_00027371" "gene-Peve_00011467" "gene-Peve_00011913" ...
$ Gene Ontology IDs: chr "GO:0005452; GO:0005886; GO:0008509; GO:0015701; GO:0051453; GO:0055085; GO:0086001" "GO:0005452; GO:0005886; GO:0008509; GO:0015701; GO:0051453; GO:0055085; GO:0086001" "GO:0004609; GO:0005739; GO:0006646" "GO:0004609; GO:0005739; GO:0006646" ...
6.4.1 Write merged to file
# Save the merged gene / SwissProt accession / GO ID table as a
# tab-separated file without row names or quoting.
write.table(
  x = gene_SPID_GOID_merged,
  file = file.path(output_dir, "gene-SPIDs-GOIDs.tsv"),
  quote = FALSE,
  sep = "\t",
  row.names = FALSE
)
6.5 Clean up merged
full.gene.df <- as.data.frame(gene_SPID_GOID_merged)

# Normalise the GO ID separator, drop genes without GO IDs, and keep only
# the gene ID and GO ID columns.
gene.GO.df <- full.gene.df %>%
  # Remove any whitespace around ";" so GO IDs split cleanly later
  mutate(
    !!"Gene Ontology IDs" := str_replace_all(
      .data[["Gene Ontology IDs"]], "\\s*;\\s*", ";"
    )
  ) %>%
  # Drop rows with a missing gene ID or a missing/empty GO ID string
  filter(
    !is.na(.data[["GeneID"]]) &
      !is.na(.data[["Gene Ontology IDs"]]) &
      .data[["Gene Ontology IDs"]] != ""
  ) %>%
  dplyr::select(all_of(c("GeneID", "Gene Ontology IDs")))

str(gene.GO.df)
'data.frame': 8781 obs. of 2 variables:
$ GeneID : chr "gene-Peve_00003282" "gene-Peve_00027371" "gene-Peve_00011467" "gene-Peve_00011913" ...
$ Gene Ontology IDs: chr "GO:0005452;GO:0005886;GO:0008509;GO:0015701;GO:0051453;GO:0055085;GO:0086001" "GO:0005452;GO:0005886;GO:0008509;GO:0015701;GO:0051453;GO:0055085;GO:0086001" "GO:0004609;GO:0005739;GO:0006646" "GO:0004609;GO:0005739;GO:0006646" ...
6.6 Flatten gene GOID file
# Expand the ";"-delimited GO ID string so that each row holds a single
# GeneID / GO ID pair.
flat.gene.GO.df <- gene.GO.df %>%
  separate_rows(!!sym("Gene Ontology IDs"), sep = ";")

str(flat.gene.GO.df)
tibble [106,348 × 2] (S3: tbl_df/tbl/data.frame)
$ GeneID : chr [1:106348] "gene-Peve_00003282" "gene-Peve_00003282" "gene-Peve_00003282" "gene-Peve_00003282" ...
$ Gene Ontology IDs: chr [1:106348] "GO:0005452" "GO:0005886" "GO:0008509" "GO:0015701" ...
6.7 Group by GOID
# One row per GO ID, with all GeneIDs annotated to it joined by ","
grouped.gene.GO.df <- flat.gene.GO.df %>%
  group_by(!!sym("Gene Ontology IDs")) %>%
  summarise(!!"GeneID" := paste(.data[["GeneID"]], collapse = ","))

str(grouped.gene.GO.df)
tibble [12,132 × 2] (S3: tbl_df/tbl/data.frame)
$ Gene Ontology IDs: chr [1:12132] "GO:0000002" "GO:0000009" "GO:0000012" "GO:0000014" ...
$ GeneID : chr [1:12132] "gene-Peve_00012113,gene-Peve_00027057,gene-Peve_00036001,gene-Peve_00043435,gene-Peve_00020141,gene-Peve_000383"| __truncated__ "gene-Peve_00020421" "gene-Peve_00036938,gene-Peve_00028780,gene-Peve_00013011,gene-Peve_00032733" "gene-Peve_00023146,gene-Peve_00032460,gene-Peve_00008105,gene-Peve_00013484,gene-Peve_00013490,gene-Peve_000098"| __truncated__ ...
6.8 Vectorize GOIDs
# Vector of GO IDs (one element per row of the grouped table)
go_ids <- grouped.gene.GO.df[["Gene Ontology IDs"]]

str(go_ids)
chr [1:12132] "GO:0000002" "GO:0000009" "GO:0000012" "GO:0000014" ...
6.9 Prepare GOslim OBO
# Find GSEAbase installation location
gseabase_location <- find.package("GSEABase")

# Build the destination path for the GOslim OBO file inside the package's
# "extdata" directory
goslim_obo_destintation <- file.path(gseabase_location, "extdata", goslims_obo, fsep = "/")

# Download the GOslim OBO file
# NOTE(review): this writes into the installed GSEABase package directory,
# so it requires write access to the R library — confirm this is intended.
download.file(url = goslims_url, destfile = goslim_obo_destintation)

# Resolve the path of the downloaded OBO file via the package's file lookup
gseabase_files <- system.file("extdata", goslims_obo, package = "GSEABase")
6.10 GOslims from OBO
# Create GSEAbase GOCollection using `go_ids`
myCollection <- GOCollection(go_ids)

# Retrieve GOslims from GO OBO file set
slim <- getOBOCollection(gseabase_files)

str(slim)
Formal class 'OBOCollection' [package "GSEABase"] with 7 slots
..@ .stanza :'data.frame': 153 obs. of 1 variable:
.. ..$ value: chr [1:153] "Root" "Term" "Term" "Term" ...
..@ .subset :'data.frame': 22 obs. of 1 variable:
.. ..$ value: chr [1:22] "Rhea list of ChEBI terms representing the major species at pH 7.3." "Term not to be used for direct annotation" "Terms planned for obsoletion" "AGR slim" ...
..@ .kv :'data.frame': 2110 obs. of 3 variables:
.. ..$ stanza_id: chr [1:2110] ".__Root__" ".__Root__" ".__Root__" ".__Root__" ...
.. ..$ key : chr [1:2110] "format-version" "data-version" "synonymtypedef" "synonymtypedef" ...
.. ..$ value : chr [1:2110] "1.2" "go/releases/2025-06-01/subsets/goslim_generic.owl" "syngo_official_label \"label approved by the SynGO project\"" "systematic_synonym \"Systematic synonym\" EXACT" ...
..@ evidenceCode: chr [1:26] "EXP" "IDA" "IPI" "IMP" ...
..@ ontology : chr NA
..@ ids : chr [1:141] "GO:0000228" "GO:0000278" "GO:0000910" "GO:0001618" ...
..@ type : chr "OBO"
6.11 Biological Process GOslims
# Retrieve Biological Process (BP) GOslims
# NOTE(review): `verbose` is passed positionally and is not defined anywhere in
# this section — confirm it exists upstream or replace with `verbose = FALSE`.
slimdf <- goSlim(myCollection, slim, "BP", verbose)

str(slimdf)
'data.frame': 72 obs. of 3 variables:
$ Count : int 86 19 19 417 51 85 20 4 88 55 ...
$ Percent: num 1.077 0.238 0.238 5.224 0.639 ...
$ Term : chr "mitotic cell cycle" "cytokinesis" "cytoplasmic translation" "immune system process" ...
6.12 Map GO to GOslims
# For each GOslim ID (row name of `slimdf`), list its BP offspring terms
gomap <- as.list(GOBPOFFSPRING[rownames(slimdf)])

# Keep only the offspring that are present in `go_ids` (via `myCollection`)
mapped <- lapply(gomap, intersect, ids(myCollection))

# Append all mapped GO IDs to `slimdf` as semi-colon delimited strings.
# `mapped` is reused rather than recomputing the intersection, and `vapply()`
# guarantees a character vector even when `mapped` is empty.
# GOslims with no mapped IDs get "" (paste() of an empty vector with `collapse`
# returns ""), so no "character(0)" clean-up is needed afterwards.
slimdf$GO.IDs <- vapply(mapped, paste, character(1), collapse = ";")
# Add self-matching GOIDs to "GO.IDs" column, if not present.
# Only GO IDs that are themselves GOslim IDs (row names of `slimdf`) matter.
for (go_id in intersect(rownames(slimdf), go_ids)) {
  current_ids <- slimdf[go_id, "GO.IDs"]

  # trimws()/toupper() guard against stray whitespace or case differences
  # in the already-stored IDs.
  existing_ids <- trimws(toupper(strsplit(current_ids, ";", fixed = TRUE)[[1]]))

  if (!go_id %in% existing_ids) {
    # Check THIS row's value (not the last row of the data frame) to decide
    # whether a separator is needed, and use the same bare ";" separator as
    # the rest of the column so splitting on ";" does not leave a leading space.
    if (is.na(current_ids) || !nzchar(current_ids)) {
      slimdf[go_id, "GO.IDs"] <- go_id
    } else {
      slimdf[go_id, "GO.IDs"] <- paste(current_ids, go_id, sep = ";")
    }
  }
}

str(slimdf)
'data.frame': 72 obs. of 4 variables:
$ Count : int 86 19 19 417 51 85 20 4 88 55 ...
$ Percent: num 1.077 0.238 0.238 5.224 0.639 ...
$ Term : chr "mitotic cell cycle" "cytokinesis" "cytoplasmic translation" "immune system process" ...
$ GO.IDs : chr "GO:0000022;GO:0000070;GO:0000082;GO:0000086;GO:0000132;GO:0000281;GO:0007052;GO:0007064;GO:0007076;GO:0007079;G"| __truncated__ "GO:0000281;GO:0000915;GO:0007110;GO:0007111;GO:0007112;GO:0032465;GO:0032466;GO:0032467;GO:0036089;GO:0036090;G"| __truncated__ "GO:0001731;GO:0001732;GO:0002182;GO:0002183;GO:0002184;GO:0002188;GO:0002191;GO:0017183;GO:0140708;GO:1900248;G"| __truncated__ "GO:0001771;GO:0001774;GO:0001776;GO:0001777;GO:0001779;GO:0001780;GO:0001782;GO:0001805;GO:0001807;GO:0001812;G"| __truncated__ ...
6.13 Flatten GOslims
# "Flatten" file so each row is single GO ID with corresponding GOslim
# rownames_to_column needed to retain row name info
slimdf_separated <- as.data.frame(
  slimdf %>%
    rownames_to_column("GOslim") %>%
    separate_rows(GO.IDs, sep = ";") %>%
    # Strip whitespace left around IDs after splitting, so that keys such as
    # " GO:0000278" and "GO:0000278" are treated as the same GO ID.
    mutate(GO.IDs = trimws(GO.IDs))
)

# Group by unique GO ID
grouped_slimdf <- slimdf_separated %>%
  filter(!is.na(GO.IDs) & GO.IDs != "") %>%
  group_by(GO.IDs) %>%
  summarize(
    GOslim = paste(GOslim, collapse = ";"),
    Term = paste(Term, collapse = ";")
  )

str(grouped_slimdf)
tibble [5,531 × 3] (S3: tbl_df/tbl/data.frame)
$ GO.IDs: chr [1:5531] " GO:0000278" " GO:0002181" " GO:0002376" " GO:0003014" ...
$ GOslim: chr [1:5531] "GO:0000278" "GO:0002181" "GO:0002376" "GO:0003014" ...
$ Term : chr [1:5531] "mitotic cell cycle" "cytoplasmic translation" "immune system process" "renal system process" ...
6.14 Counts of GOslims
# Sort GOslims from highest to lowest count
slimdf.sorted <- slimdf %>% arrange(desc(Count))

# Keep only the term name and its count/percent columns
slim.count.df <- slimdf.sorted %>%
  dplyr::select(Term, Count, Percent)

str(slim.count.df)
'data.frame': 72 obs. of 3 variables:
$ Term : chr "anatomical structure development" "signaling" "cell differentiation" "immune system process" ...
$ Count : int 1696 727 714 417 273 257 243 206 198 186 ...
$ Percent: num 21.25 9.11 8.94 5.22 3.42 ...
6.14.1 Write GOslims to file
Need to create a column name for GOslimIDs from data frame rownames.
# Create header vector
# "GOslimID" labels the row names, which write.table() emits as the first
# column but does not name.
header <- c("GOslimID", colnames(slim.count.df))

# Write header to file
writeLines(
  paste(header, collapse = "\t"),
  file.path(output_dir, "GOslim-counts.tsv")
)

# Append data frame contents to existing file, which contains header info
write.table(
  slim.count.df,
  file = file.path(output_dir, "GOslim-counts.tsv"),
  sep = "\t",
  row.names = TRUE,
  quote = FALSE,
  col.names = FALSE,
  append = TRUE
)
RESULTS
Multispecies barplots were created of the top 10 GOslims with the highest GO term counts.
Plots were generated with the following code:
# Define file paths and species names
# NOTE(review): the F-Ptuh path uses a "Ptua" directory name — confirm this
# matches the actual output directory.
files <- list(
  "D-Apul" = "~/gitrepos/urol-e5/deep-dive-expression/D-Apul/output/30.00-Apul-transcriptome-GOslims/GOslim-counts.tsv",
  "E-Peve" = "~/gitrepos/urol-e5/deep-dive-expression/E-Peve/output/30.00-Peve-transcriptome-GOslims/GOslim-counts.tsv",
  "F-Ptuh" = "~/gitrepos/urol-e5/deep-dive-expression/F-Ptuh/output/30.00-Ptua-transcriptome-GOslims/GOslim-counts.tsv"
)

# Read and combine data, tagging each row with the species it came from
goslim_data <- map2_dfr(
  files,
  names(files),
  ~ read_tsv(.x, show_col_types = FALSE) %>%
    mutate(Species = .y)
)

# Find the top 10 Terms by Count for each species
# (with_ties = FALSE caps each species at exactly 10 rows)
top_terms <- goslim_data %>%
  group_by(Species) %>%
  slice_max(order_by = Count, n = 10, with_ties = FALSE) %>%
  ungroup() %>%
  distinct(Term)

# Filter the data to only include these Terms
# (a Term in any species' top 10 is kept for all species)
goslim_top <- goslim_data %>%
  filter(Term %in% top_terms$Term)

# Plot: Term on Y, Count on X
ggplot(goslim_top, aes(y = fct_reorder(Term, Count), x = Count, fill = Species)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.8)) +
  labs(
    title = "Top GOslim Term Counts by Species",
    y = "GOslim Term",
    x = "Count"
  ) +
  theme_bw() +
  theme(
    axis.text.y = element_text(size = 8),
    plot.title = element_text(hjust = 0.5)
  )