27 NCBI lookup
Automating Sequence Retrieval
To practice what we’ve discussed today, copy the following code chunk into a file called fetch_genes.py, then review and comment the code.
TipDocumentation tags
To add Roxygen-like documentation tags to a python function, we can add a multi-line quoted string at the top of the function like this:
def add(x, y):
"""
Add two numbers.
Args:
x: A number.
y: A number.
Returns:
The sum of x and y.
"""
return x + ySee Section 3.8 of the Google Python Style Guide for additional details.
import requests
import time
import sys
def get_ncbi_id(gene_symbol, organism="human"):
url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
params = {
"db": "nucleotide",
"term": f"{gene_symbol}[Gene] AND {organism}[Organism] AND refseq[Filter]",
"retmode": "json"
}
try:
response = requests.get(url, params=params)
response.raise_for_status()
data = response.json()
id_list = data.get('esearchresult', {}).get('idlist', [])
if id_list:
return id_list[0]
else:
print(f"Warning: No ID found for {gene_symbol}.")
return None
except requests.exceptions.RequestException as e:
print(f"Error connecting to NCBI ESearch: {e}")
return None
def fetch_fasta(ncbi_id):
url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
params = {
"db": "nucleotide",
"id": ncbi_id,
"rettype": "fasta",
"retmode": "text"
}
try:
response = requests.get(url, params=params)
response.raise_for_status()
return response.text
except requests.exceptions.RequestException as e:
print(f"Error fetching sequence for ID {ncbi_id}: {e}")
return None
def main():
genes_of_interest = ["TP53", "BRCA1", "EGFR"]
output_filename = "pathway_sequences.fasta"
print(f"Starting retrieval for {len(genes_of_interest)} genes...")
with open(output_filename, "w") as outfile:
for gene in genes_of_interest:
print(f"Processing {gene}...")
gene_id = get_ncbi_id(gene)
if gene_id:
time.sleep(0.5)
fasta_seq = fetch_fasta(gene_id)
if fasta_seq:
outfile.write(fasta_seq)
if not fasta_seq.endswith('\n'):
outfile.write('\n')
print(f" -> Successfully saved {gene} (ID: {gene_id})")
time.sleep(0.5)
print(f"\nDone! All sequences saved to {output_filename}")
if __name__ == "__main__":
main()This is a nice proof of principle, but it would be much more useful if we could give it a dynamic list of genes. Add the following code to the main() function to make it useable from the command line.
# Check if the user provided any gene arguments
# sys.argv is a list where index 0 is the script name, and subsequent indices are the arguments
if len(sys.argv) < 2:
print("Usage: python fetch_genes.py <gene1> <gene2> [gene3...]")
print("Example: python fetch_genes.py TP53 BRCA1 EGFR")
sys.exit(1) # Exit the script cleanly with an error status
# Capture all arguments provided after the script name
genes_of_interest = sys.argv[1:]Assignment
When you have finished documenting, generalizing and testing your fetch_genes.py script, upload the file to Blackboard.