27  NCBI lookup

Author
Affiliation

Dr Randy Johnson

Hood College

Published

March 26, 2026

Automating Sequence Retrieval

To practice what we’ve discussed today, copy the following code chunk into a file called fetch_genes.py, then review and comment the code.

TipDocumentation tags

To add Roxygen-like documentation tags to a python function, we can add a multi-line quoted string at the top of the function like this:

def add(x, y):
    """
    Add two numbers.

    Args:
        x: A number.
        y: A number.

    Returns:
        The sum of x and y.
    """
    return x + y

See Section 3.8 of the Google Python Style Guide for additional details.

import requests
import time
import sys

def get_ncbi_id(gene_symbol, organism="human"):
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "nucleotide",
        "term": f"{gene_symbol}[Gene] AND {organism}[Organism] AND refseq[Filter]",
        "retmode": "json"
    }
    
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
        data = response.json()
        
        id_list = data.get('esearchresult', {}).get('idlist', [])
        if id_list:
            return id_list[0]
        else:
            print(f"Warning: No ID found for {gene_symbol}.")
            return None
            
    except requests.exceptions.RequestException as e:
        print(f"Error connecting to NCBI ESearch: {e}")
        return None

def fetch_fasta(ncbi_id):
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "nucleotide",
        "id": ncbi_id,
        "rettype": "fasta",
        "retmode": "text"
    }
    
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching sequence for ID {ncbi_id}: {e}")
        return None

def main():
    genes_of_interest = ["TP53", "BRCA1", "EGFR"]
    output_filename = "pathway_sequences.fasta"
    
    print(f"Starting retrieval for {len(genes_of_interest)} genes...")
    
    with open(output_filename, "w") as outfile:
        for gene in genes_of_interest:
            print(f"Processing {gene}...")
            
            gene_id = get_ncbi_id(gene)
            
            if gene_id:
                time.sleep(0.5) 
                
                fasta_seq = fetch_fasta(gene_id)
                
                if fasta_seq:
                    outfile.write(fasta_seq)

                    if not fasta_seq.endswith('\n'):
                        outfile.write('\n')
                    print(f"  -> Successfully saved {gene} (ID: {gene_id})")
            
            time.sleep(0.5)
            
    print(f"\nDone! All sequences saved to {output_filename}")

if __name__ == "__main__":
    main()

This is a nice proof of principle, but it would be much more useful if we could give it a dynamic list of genes. Add the following code to the main() function to make it useable from the command line.

# Check if the user provided any gene arguments
# sys.argv is a list where index 0 is the script name, and subsequent indices are the arguments
if len(sys.argv) < 2:
    print("Usage: python fetch_genes.py <gene1> <gene2> [gene3...]")
    print("Example: python fetch_genes.py TP53 BRCA1 EGFR")
    sys.exit(1) # Exit the script cleanly with an error status

# Capture all arguments provided after the script name
genes_of_interest = sys.argv[1:]

Assignment

When you have finished documenting, generalizing and testing your fetch_genes.py script, upload the file to Blackboard.