This notebook consists of a markdown rendering of 01-GO-to-GOslim.Rmd (commit: 04a796d
).
1 INTRO
This notebook is setup to take GO IDs and categorize them into their corresponding GOslims (GitHub Issue). Specifically, it will use the following tab-delimited input file:
Desired output format (GitHub Issue comment) is:
Term Count
anatomical structure development 1137
signaling 600
cell differentiation 533
immune system process 331
lipid metabolic process 212
This was performed using R, with the following packages:
- GSEABase (Martin Morgan 2017)
- GO.db (Carlson 2017)
- tidyverse (Wickham et al. 2019)
It also relied on gene ontology (GO) information from the Gene Ontology Consortium (Ashburner et al. 2000; Aleksander et al. 2023).
1.1 CODE
library(GSEABase)
library(GO.db)
##
library(knitr)
library(tidyverse)
::opts_chunk$set(
knitrecho = TRUE, # Display code chunks
eval = FALSE, # Evaluate code chunks
warning = FALSE, # Hide warnings
message = FALSE, # Hide messages
comment = "" # Prevents appending '##' to beginning of lines in code output
)
1.1.1 Variables
# Column names corresponding to gene name/ID and GO IDs
<- "Gene.Ontology.IDs"
GO.ID.column <- "X1"
gene.ID.column
# Relative path or URL to input file
<- "https://gannet.fish.washington.edu/seashell/bu-github/project-pycno-multispecies-2023/output/06-Go-3-species/der_go.tsv"
input.file
##### Official GO info - no need to change #####
<- "goslim_generic.obo"
goslims_obo <- "http://current.geneontology.org/ontology/subsets/goslim_generic.obo" goslims_url
1.1.2 Set GSEAbase location and download goslim_generic.obo
# Find GSEAbase installation location
<- find.package("GSEABase")
gseabase_location
# Load path to GOslim OBO file
<- file.path(gseabase_location, "extdata", goslims_obo, fsep = "/")
goslim_obo_destintation
# Download the GOslim OBO file
download.file(url = goslims_url, destfile = goslim_obo_destintation)
# Loads package files
<- system.file("extdata", goslims_obo, package="GSEABase") gseabase_files
1.1.3 Read in gene/GO file
<- read.csv(file = input.file, header = TRUE, sep = "\t")
full.gene.df
str(full.gene.df)
'data.frame': 34254 obs. of 25 variables:
$ X1 : chr "TRINITY_DN37009_c0_g3_i1" "TRINITY_DN37009_c0_g3_i3" "TRINITY_DN37035_c0_g1_i1" "TRINITY_DN37023_c0_g1_i1" ...
$ X2 : chr "sp" "sp" "sp" "sp" ...
$ X3 : chr "Q9BYN0" "Q9BYN0" "P21329" "P07700" ...
$ X4 : chr "SRXN1_HUMAN" "SRXN1_HUMAN" "RTJK_DROFU" "ADRB1_MELGA" ...
$ X5 : num 57.6 57.6 28.6 33.3 31.6 ...
$ X6 : int 125 125 147 99 133 49 90 179 85 37 ...
$ X7 : int 47 47 92 65 87 21 47 105 44 13 ...
$ X8 : int 2 2 4 1 3 0 3 7 2 1 ...
$ X9 : int 2822 1720 169 47 473 85 15 1 104 102 ...
$ X10 : int 2448 1346 579 340 84 231 275 513 343 4 ...
$ X11 : int 18 18 613 59 44 374 425 85 64 1097 ...
$ X12 : int 136 136 756 157 175 422 509 246 148 1133 ...
$ X13 : num 1.11e-39 3.47e-40 4.94e-08 6.59e-09 9.86e-12 ...
$ X14 : num 147 147 55.8 55.5 66.6 58.9 49.7 51.6 69.3 46.2 ...
$ Reviewed : chr "reviewed" "reviewed" NA "reviewed" ...
$ Entry.Name : chr "SRXN1_HUMAN" "SRXN1_HUMAN" NA "ADRB1_MELGA" ...
$ Protein.names : chr "Sulfiredoxin-1 (EC 1.8.98.2)" "Sulfiredoxin-1 (EC 1.8.98.2)" NA "Beta-1 adrenergic receptor (Beta-1 adrenoreceptor) (Beta-1 adrenoceptor) (Beta-T)" ...
$ Gene.Names : chr "SRXN1 C20orf139 SRX SRX1" "SRXN1 C20orf139 SRX SRX1" NA "ADRB1" ...
$ Organism : chr "Homo sapiens (Human)" "Homo sapiens (Human)" NA "Meleagris gallopavo (Wild turkey)" ...
$ Length : int 137 137 NA 483 483 1056 NA 364 640 1358 ...
$ Gene.Ontology..biological.process.: chr "cellular response to oxidative stress [GO:0034599]; response to oxidative stress [GO:0006979]" "cellular response to oxidative stress [GO:0034599]; response to oxidative stress [GO:0006979]" NA "adenylate cyclase-activating adrenergic receptor signaling pathway [GO:0071880]; positive regulation of heart c"| __truncated__ ...
$ Gene.Ontology..cellular.component.: chr "cytoplasm [GO:0005737]; cytosol [GO:0005829]; endoplasmic reticulum membrane [GO:0005789]" "cytoplasm [GO:0005737]; cytosol [GO:0005829]; endoplasmic reticulum membrane [GO:0005789]" NA "early endosome [GO:0005769]; membrane [GO:0016020]; plasma membrane [GO:0005886]" ...
$ Gene.Ontology..GO. : chr "cytoplasm [GO:0005737]; cytosol [GO:0005829]; endoplasmic reticulum membrane [GO:0005789]; ATP binding [GO:0005"| __truncated__ "cytoplasm [GO:0005737]; cytosol [GO:0005829]; endoplasmic reticulum membrane [GO:0005789]; ATP binding [GO:0005"| __truncated__ NA "early endosome [GO:0005769]; membrane [GO:0016020]; plasma membrane [GO:0005886]; beta1-adrenergic receptor act"| __truncated__ ...
$ Gene.Ontology..molecular.function.: chr "ATP binding [GO:0005524]; oxidoreductase activity, acting on a sulfur group of donors [GO:0016667]; sulfiredoxi"| __truncated__ "ATP binding [GO:0005524]; oxidoreductase activity, acting on a sulfur group of donors [GO:0016667]; sulfiredoxi"| __truncated__ NA "beta1-adrenergic receptor activity [GO:0004940]; identical protein binding [GO:0042802]" ...
$ Gene.Ontology.IDs : chr "GO:0005524; GO:0005737; GO:0005789; GO:0005829; GO:0006979; GO:0016667; GO:0032542; GO:0034599" "GO:0005524; GO:0005737; GO:0005789; GO:0005829; GO:0006979; GO:0016667; GO:0032542; GO:0034599" NA "GO:0004940; GO:0005769; GO:0005886; GO:0016020; GO:0042802; GO:0045187; GO:0045823; GO:0071880" ...
1.1.4 Remove rows with NA, remove whitespace in GO IDs column and keep just gene/GO IDs columns
# Clean whitespace, filter NA/empty rows, select columns, and split GO terms using column name variables
<- full.gene.df %>%
gene.GO.df mutate(!!GO.ID.column := str_replace_all(.data[[GO.ID.column]], "\\s*;\\s*", ";")) %>% # Clean up spaces around ";"
filter(!is.na(.data[[gene.ID.column]]) & !is.na(.data[[GO.ID.column]]) & .data[[GO.ID.column]] != "") %>%
select(all_of(c(gene.ID.column, GO.ID.column)))
str(gene.GO.df)
'data.frame': 23192 obs. of 2 variables:
$ X1 : chr "TRINITY_DN37009_c0_g3_i1" "TRINITY_DN37009_c0_g3_i3" "TRINITY_DN37023_c0_g1_i1" "TRINITY_DN37023_c1_g1_i1" ...
$ Gene.Ontology.IDs: chr "GO:0005524;GO:0005737;GO:0005789;GO:0005829;GO:0006979;GO:0016667;GO:0032542;GO:0034599" "GO:0005524;GO:0005737;GO:0005789;GO:0005829;GO:0006979;GO:0016667;GO:0032542;GO:0034599" "GO:0004940;GO:0005769;GO:0005886;GO:0016020;GO:0042802;GO:0045187;GO:0045823;GO:0071880" "GO:0004940;GO:0005769;GO:0005886;GO:0016020;GO:0042802;GO:0045187;GO:0045823;GO:0071880" ...
1.1.5 “Flatten” gene and GO IDs
This flattens the file so all of the GO IDs per gene are separated into one GO ID per gene per row.
<- gene.GO.df %>% separate_rows(!!sym(GO.ID.column), sep = ";")
flat.gene.GO.df
str(flat.gene.GO.df)
tibble [363,990 × 2] (S3: tbl_df/tbl/data.frame)
$ X1 : chr [1:363990] "TRINITY_DN37009_c0_g3_i1" "TRINITY_DN37009_c0_g3_i1" "TRINITY_DN37009_c0_g3_i1" "TRINITY_DN37009_c0_g3_i1" ...
$ Gene.Ontology.IDs: chr [1:363990] "GO:0005524" "GO:0005737" "GO:0005789" "GO:0005829" ...
1.1.6 Group
Groups the genes by GO ID (i.e. lists all genes associated with each unique GO ID)
<- flat.gene.GO.df %>%
grouped.gene.GO.df group_by(!!sym(GO.ID.column)) %>%
summarise(!!gene.ID.column := paste(.data[[gene.ID.column]], collapse = ","))
str(grouped.gene.GO.df)
tibble [15,068 × 2] (S3: tbl_df/tbl/data.frame)
$ Gene.Ontology.IDs: chr [1:15068] "GO:0000002" "GO:0000009" "GO:0000012" "GO:0000014" ...
$ X1 : chr [1:15068] "TRINITY_DN12635_c0_g1_i1,TRINITY_DN6578_c1_g1_i2,TRINITY_DN20059_c0_g1_i11,TRINITY_DN20059_c0_g1_i13,TRINITY_DN"| __truncated__ "TRINITY_DN3933_c0_g3_i1" "TRINITY_DN2179_c0_g7_i1,TRINITY_DN2179_c0_g7_i2,TRINITY_DN8923_c0_g2_i1,TRINITY_DN8283_c0_g1_i3,TRINITY_DN8283_"| __truncated__ "TRINITY_DN31365_c0_g1_i1,TRINITY_DN31365_c0_g1_i2,TRINITY_DN53840_c0_g1_i1,TRINITY_DN662_c0_g1_i2,TRINITY_DN149"| __truncated__ ...
1.1.7 Map GO IDs to GOslims
The mapping steps were derived from this bioconductor forum response
# Vector of GO IDs
<- grouped.gene.GO.df[[GO.ID.column]]
go_ids
str(go_ids)
chr [1:15068] "GO:0000002" "GO:0000009" "GO:0000012" "GO:0000014" ...
1.1.8 Extract GOslims from OBO
Creates new OBO Collection object of just GOslims, based on provided GO IDs.
# Create GSEAbase GOCollection using `go_ids`
<- GOCollection(go_ids)
myCollection
# Retrieve GOslims from GO OBO file set
<- getOBOCollection(gseabase_files)
slim
str(slim)
Formal class 'OBOCollection' [package "GSEABase"] with 7 slots
..@ .stanza :'data.frame': 153 obs. of 1 variable:
.. ..$ value: chr [1:153] "Root" "Term" "Term" "Term" ...
..@ .subset :'data.frame': 22 obs. of 1 variable:
.. ..$ value: chr [1:22] "Rhea list of ChEBI terms representing the major species at pH 7.3." "Term not to be used for direct annotation" "Terms planned for obsoletion" "AGR slim" ...
..@ .kv :'data.frame': 2075 obs. of 3 variables:
.. ..$ stanza_id: chr [1:2075] ".__Root__" ".__Root__" ".__Root__" ".__Root__" ...
.. ..$ key : chr [1:2075] "format-version" "data-version" "synonymtypedef" "synonymtypedef" ...
.. ..$ value : chr [1:2075] "1.2" "go/2024-09-08/subsets/goslim_generic.owl" "syngo_official_label \"label approved by the SynGO project\"" "systematic_synonym \"Systematic synonym\" EXACT" ...
..@ evidenceCode: chr [1:26] "EXP" "IDA" "IPI" "IMP" ...
..@ ontology : chr NA
..@ ids : chr [1:141] "GO:0000228" "GO:0000278" "GO:0000910" "GO:0001618" ...
..@ type : chr "OBO"
1.1.9 Get Biological Process (BP) GOslims associated with provided GO IDs.
# Retrieve Biological Process (BP) GOslims
<- goSlim(myCollection, slim, "BP", verbose)
slimdf str(slimdf)
'data.frame': 72 obs. of 3 variables:
$ Count : int 93 22 18 554 81 107 31 5 103 58 ...
$ Percent: num 0.925 0.219 0.179 5.508 0.805 ...
$ Term : chr "mitotic cell cycle" "cytokinesis" "cytoplasmic translation" "immune system process" ...
1.1.10 Map GO to GOslims
Performs mapping of of GOIDs to GOslims
Returns:
- GOslim IDs (as rownames)
- GOslim terms
- Counts of GO IDs matching to corresponding GOslim
- Percentage of GO IDs matching to corresponding GOslim
- GOIDs mapped to corresponding GOslim, in a semi-colon delimited format
# List of GOslims and all GO IDs from `go_ids`
<- as.list(GOBPOFFSPRING[rownames(slimdf)])
gomap
# Maps `go_ids` to matching GOslims
<- lapply(gomap, intersect, ids(myCollection))
mapped
# Append all mapped GO IDs to `slimdf`
# `sapply` needed to apply paste() to create semi-colon delimited values
$GO.IDs <- sapply(lapply(gomap, intersect, ids(myCollection)), paste, collapse=";")
slimdf
# Remove "character(0) string from "GO.IDs" column
$GO.IDs[slimdf$GO.IDs == "character(0)"] <- ""
slimdf
# Add self-matching GOIDs to "GO.IDs" column, if not present
for (go_id in go_ids) {
# Check if the go_id is present in the row names
if (go_id %in% rownames(slimdf)) {
# Check if the go_id is not present in the GO.IDs column
# Also removes white space "trimws()" and converts all to upper case to handle
# any weird, "invisible" formatting issues.
if (!go_id %in% trimws(toupper(strsplit(slimdf[go_id, "GO.IDs"], ";")[[1]]))) {
# Append the go_id to the GO.IDs column with a semi-colon separator
if (length(slimdf$GO.IDs) > 0 && nchar(slimdf$GO.IDs[nrow(slimdf)]) > 0) {
"GO.IDs"] <- paste0(slimdf[go_id, "GO.IDs"], "; ", go_id)
slimdf[go_id, else {
} "GO.IDs"] <- go_id
slimdf[go_id,
}
}
}
}
str(slimdf)
'data.frame': 72 obs. of 4 variables:
$ Count : int 93 22 18 554 81 107 31 5 103 58 ...
$ Percent: num 0.925 0.219 0.179 5.508 0.805 ...
$ Term : chr "mitotic cell cycle" "cytokinesis" "cytoplasmic translation" "immune system process" ...
$ GO.IDs : chr "GO:0000022;GO:0000070;GO:0000082;GO:0000086;GO:0000132;GO:0000281;GO:0006977;GO:0007052;GO:0007076;GO:0007079;G"| __truncated__ "GO:0000281;GO:0000911;GO:0000915;GO:0007112;GO:0031991;GO:0032465;GO:0032466;GO:0032467;GO:0032506;GO:0033206;G"| __truncated__ "GO:0001731;GO:0001732;GO:0002183;GO:0002184;GO:0002188;GO:0002190;GO:0002191;GO:0002192;GO:0140018;GO:0140708;G"| __truncated__ "GO:0001768;GO:0001771;GO:0001773;GO:0001774;GO:0001776;GO:0001779;GO:0001780;GO:0001782;GO:0001798;GO:0001805;G"| __truncated__ ...
1.1.11 Flatten GOslims
# "Flatten" file so each row is single GO ID with corresponding GOslim
# rownames_to_column needed to retain row name info
<- as.data.frame(slimdf %>%
slimdf_separated rownames_to_column('GOslim') %>%
separate_rows(GO.IDs, sep = ";"))
# Group by unique GO ID
<- slimdf_separated %>%
grouped_slimdf filter(!is.na(GO.IDs) & GO.IDs != "") %>%
group_by(GO.IDs) %>%
summarize(GOslim = paste(GOslim, collapse = ";"),
Term = paste(Term, collapse = ";"))
str(grouped_slimdf)
tibble [6,942 × 3] (S3: tbl_df/tbl/data.frame)
$ GO.IDs: chr [1:6942] " GO:0000278" " GO:0002181" " GO:0002376" " GO:0003014" ...
$ GOslim: chr [1:6942] "GO:0000278" "GO:0002181" "GO:0002376" "GO:0003014" ...
$ Term : chr [1:6942] "mitotic cell cycle" "cytoplasmic translation" "immune system process" "renal system process" ...
1.2 Write slimdf to file
1.2.1 Sort and select slmidf
Sorts GOslims by Count
, in descending order and then selects just the Term
and Count
columns.
<- slimdf %>% arrange(desc(Count))
slimdf.sorted
<- slimdf.sorted %>%
slim.count.df select(Term, Count)
str(slim.count.df)
'data.frame': 72 obs. of 2 variables:
$ Term : chr "anatomical structure development" "cell differentiation" "signaling" "immune system process" ...
$ Count: int 2091 932 908 554 352 306 278 275 264 246 ...
1.2.2 Write formatted slim.count.df to file
write_tsv(slim.count.df, file = "../output/01-GO-to-GOslim/counts.GOID-per-GOslim_term.tab")
1.3 Count unique Biological Process GO IDs
# Flatten the list and count total GO IDs
<- length(unlist(gomap))
total_go_ids
# Display the total count
total_go_ids
[1] 26534
# Unlist to extract all GO IDs, then find unique ones and count them
<- unique(unlist(gomap))
unique_go_ids <- length(unique_go_ids)
total_unique_ids
# Display the total count of unique GO IDs
total_unique_ids
[1] 17913
Total starting BP GO IDs: 26534
Total unique BP GO IDs: 17913