Ivan Tomic info@ivantomic.com
The goal of genular
is to provide a comprehensive
toolkit for interacting with the Genular API,
facilitating the retrieval and analysis of genomic data directly within
R.
You can install the released version of genular
from CRAN with:
install.packages("genular")
Or you can install genular
directly from GitHub with use
of following commands:
install.packages("devtools")
::install_github("atomiclaboratory/genular-database", subdir = 'libraries/genular-api/R-package') devtools
# Check if plyr is installed and install it if necessary
if (!requireNamespace("plyr", quietly = TRUE)) {
install.packages("plyr")
}if (!requireNamespace("dplyr", quietly = TRUE)) {
install.packages("dplyr")
}
# Load the genular r-package
library(genular)
# Your personal API Key
# (request here or use your own installation: https://genular.atomic-lab.org/contact)
<- "3147dda5fa023c9763d38bddb472dd28"
API_KEY # If using own installation add custom endpoint variable in options, refer to manual
# ENDPOINT = "https://localhost/api/v1/cells/suggest",
# This example demonstrates how to suggest relevant cell types based on provided keywords. We use the `cells_suggest()` function to find cell types related to specific terms like "endothelial cell" and "T cell".
# Define the query values for cell type suggestions
<- c("endothelial cell", "T cell")
queryValues # Use the cells_suggest function to get relevant cell type matches
<- cells_suggest(queryValues,
cell_suggest_results options = list(api_key = API_KEY, timeout = 5000)
)# Display the suggested cell types
# print(head(cell_suggest_results))
# In this example, we demonstrate how to search for specific cell information based on conditions applied to multiple cell IDs. The `cells_search()` function is used to retrieve data about cells that meet the specified criteria.
# Define query conditions for different cell IDs
<- list(
queryValues "CL0002350" = ">= 565",
"CL2000041" = ">= 323",
"CL0000129" = ">= 1821",
"CL0000453" = ">= 299"
)
# Define the fields to filter in the search results
<- c(
fieldsFilter "geneID",
"symbol",
"crossReference.enseGeneID",
"singleCellExpressions.effectSizes.i",
"singleCellExpressions.effectSizes.s",
"singleCellExpressions.effectSizes.c"
)
# Perform the cell search
<- cells_search(
cell_search_results
queryValues,
fieldsFilter,page = 1,
limit = 10,
searchType = "or",
orderBy = "geneID",
sortDirection = "asc",
responseType = "json",
organismType = list(9606),
options = list(api_key = "3147dda5fa023c9763d38bddb472dd28", timeout = 10000)
)# Display the results
# print(head(cell_search_results))
# This example demonstrates how to search for specific gene information using gene IDs. We use the `gene_search()` function to query gene-related details such as gene symbols and Ensembl gene IDs.
# Define the fields to search for and query gene IDs
<- list(c("geneID"))
queryFields <- c(1, 56, 70) # Specify the gene IDs to search for
queryValues <- "or"
searchType <- c("geneID", "symbol", "crossReference.enseGeneID")
fieldsFilter = c(9606)
organismType
# Perform the gene search
<- gene_search(queryFields, queryValues, fieldsFilter, searchType = searchType,
gene_search_results organismType = organismType, page = 1, limit = 10,
options = list(api_key = API_KEY, timeout = 1000))
# Display the results
# print(head(gene_search_results))
# In this example, we demonstrate how to use keywords to suggest relevant biological pathways. We use the `pathways_suggest()` function to find pathways related to specific biological processes or activities, such as "apoptosis" and "signal transduction".
# Define the query values for pathway suggestion
<- c("apoptosis", "signal transduction")
queryValues # Use the pathways_suggest function to get relevant pathway matches
<- pathways_suggest(queryValues)
pathway_suggest_results # Display the suggested pathways
# print(head(pathway_suggest_results))
# This example demonstrates how to perform feature engineering on gene expression data using external gene-related information. The goal is to convert raw gene expression data into pathway-level features for further analysis.
# Define gene expression data for a set of genes across multiple samples
<- data.frame(
input_data A1CF = c(2, 3, 3, 3), # Expression levels for A1CF
A2M = c(3, 4, 3, 3), # Expression levels for A2M
A4GALT = c(3, 4, 3, 4), # Expression levels for A4GALT
A4GNT = c(3, 4, 3, 3) # Expression levels for A4GNT
)
## deseq2 input_data to get relevantly expressed genes
## Fetch gene-related data from genular database
# Using `fetch_all_gene_search_results()`, we query gene information for the given symbols in `input_data`
<- fetch_all_gene_search_results(
all_gene_results queryFields = list(c("symbol")), # Specify that the query is based on gene symbols
queryValues = colnames(input_data), # Gene symbols extracted from the column names of 'input_data'
fieldsFilter = c("geneID", "symbol", "crossReference.enseGeneID",
"mRNAExpressions.proteinAtlas.c",
"ontology.id", "ontology.term", "ontology.cat"), # Fields to be fetched
searchType = "or", # Search type specifying how to combine query fields
orderBy = "geneID", # Order the results by gene ID
sortDirection = "asc", # Sort direction: ascending
responseType = "json", # Format of the response
matchType = "exact", # Type of matching to use for the query
organismType = list(c(9606)), # Specify the organism type (e.g., Homo sapiens)
ontologyCategories = list(), # Specify ontology categories if needed
limit = 100, # Limit the number of results returned
options = list(api_key = API_KEY, timeout = 1000) # API key and timeout for the request
)
## Transform and restructure the fetched gene-related data
# The `extract_data()` function helps map various gene attributes to a more usable format
<- extract_data(all_gene_results, list(
data_transposed "geneID" = "mappedGeneID", # Mapping gene IDs
"symbol" = "mappedSymbol", # Mapping gene symbols
"crossReference$enseGeneID" = "mappedEnseGeneID", # Mapping Ensembl gene IDs
"mRNAExpressions$proteinAtlas" = list(c("c" = "mappedC")), # Mapping mRNA expression
"ontology" = list(c("id" = "mappedId", "term" = "mappedTerm", "cat" = "mappedCat")) # Mapping ontology data
))
# Remove rows with missing ontology information
<- data_transposed[!is.na(data_transposed$mappedId), ]
data_transposed # View the first few rows of the transposed data
print(head(data_transposed[order(data_transposed$mappedId), ]))
#> mappedGeneID mappedSymbol mappedEnseGeneID mappedId mappedTerm mappedCat
#> 88 51146 A4GNT ENSG00000118017 GO:0000139 Golgi membrane 2
#> 105 53947 A4GALT ENSG00000128274 GO:0000139 Golgi membrane 2
#> 30 2 A2M ENSG00000175899 GO:0001553 luteinization 1
#> 101 53947 A4GALT ENSG00000128274 GO:0001576 globoside biosynthetic process 1
#> 31 2 A2M ENSG00000175899 GO:0001869 negative regulation of complement activation, lectin pathway 1
#> 14 2 A2M ENSG00000175899 GO:0002020 protease binding 0
## Convert gene expression data to pathway-level features
# Using the `convert_gene_expression_to_pathway_features()` function, we generate pathway-level features
<- convert_gene_expression_to_pathway_features(input_data, data_transposed, T)
final_data
# (Optional) Remove NA columns from the final data
# final_data <- final_data[, colSums(!is.na(final_data)) > 0]
# View the final data
# print(head(final_data))
# This example demonstrates how to retrieve genes associated with the "Adaptive Immune System" and process their corresponding cell marker scores for further analysis. We use `fetch_all_gene_search_results()` to obtain gene information, followed by data filtering and visualization steps.
# Fetch genes related to "Adaptive Immune System"
<- fetch_all_gene_search_results(
search_example_6 queryFields = list(c("ontology.term")),
queryValues = list(c("Adaptive Immune System")),
fieldsFilter = c("geneID", "symbol", "crossReference.enseGeneID",
"singleCellExpressions.effectSizes.i",
"singleCellExpressions.effectSizes.c",
"singleCellExpressions.effectSizes.s",
"singleCellExpressions.effectSizes.foldChange"),
searchType = "or",
orderBy = "geneID",
sortDirection = "asc",
responseType = "json",
matchType = "exact",
organismType = list(c(9606)),
limit = 100,
options = list(api_key = API_KEY, timeout = 1000)
)
library(dplyr)
library(tidyr)
library(purrr)
library(ggplot2)
<- function(gene, percentile = 90) {
filter_effect_sizes_top_percentile # Extract all foldChange values for the current gene
<- map_dbl(gene$singleCellExpressions$effectSizes, ~ .x$foldChange)
fold_changes
# Calculate the specified percentile (default is 90th percentile)
<- quantile(fold_changes, probs = percentile / 100, na.rm = TRUE)
cutoff
# Filter effectSizes to retain only those with foldChange >= cutoff
<- gene$singleCellExpressions$effectSizes %>%
filtered_effect_sizes keep(~ .x$foldChange >= cutoff)
# Update the gene's effectSizes with the filtered list
$singleCellExpressions$effectSizes <- filtered_effect_sizes
gene
return(gene)
}
<- function(gene, lineage = "root") {
filter_effect_sizes_by_lineage # Filter effectSizes to keep only those with cell_lineage == 'root'
<- gene$singleCellExpressions$effectSizes %>%
filtered_effect_sizes keep(~ .x$cell_lineage == lineage)
# Update the gene's effectSizes with the filtered list
$singleCellExpressions$effectSizes <- filtered_effect_sizes
gene
return(gene)
}
<- search_example_6 %>%
search_example_6_filtered map(~ filter_effect_sizes_top_percentile(.x, percentile = 95))
<- search_example_6_filtered %>%
search_example_6_filtered map(~ filter_effect_sizes_by_lineage(.x, lineage = "root"))
# Remove genes with no remaining effectSizes
<- search_example_6_filtered %>%
search_example_6_filtered ::keep(~ length(.x$singleCellExpressions$effectSizes) > 0)
purrr
# Sort genes by the number of cells in descending order
<- sapply(search_example_6_filtered, function(gene) {
num_cells_per_gene length(gene$singleCellExpressions$effectSizes)
})<- order(num_cells_per_gene, decreasing = TRUE)
ordered_indices <- search_example_6_filtered[ordered_indices]
search_example_6_filtered_sorted
# Convert the sorted and filtered list to a tidy data frame for visualization
<- search_example_6_filtered_sorted %>%
df_final map_dfr(function(gene) {
<- tibble(
gene_info geneID = gene$geneID,
symbol = gene$symbol,
ensemblID = gene$crossReference$enseGeneID
)<- gene$singleCellExpressions$effectSizes %>%
effect_sizes map_dfr(~ tibble(
cell_id = .x$cell_id,
context = .x$context,
markerScore = .x$markerScore,
scoreThreshold = .x$scoreThreshold,
foldChange = .x$foldChange,
cell_lineage = .x$cell_lineage,
cell_term = .x$cell_term
))bind_cols(gene_info, effect_sizes)
})
# Visualize fold changes by cell type for a selected gene
<- "HMGB1"
selected_gene <- df_final %>% filter(symbol == selected_gene)
df_filtered
ggplot(df_filtered, aes(x = reorder(cell_term, -foldChange), y = foldChange)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(
title = paste("Fold Change by Cell Type for Gene", selected_gene),
x = "Cell Type",
y = "Fold Change"
+
) theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, face = "bold")
)