Keyboard shortcuts

Press ← or → to navigate between chapters

Press S or / to search in the book

Press ? to show this help

Press Esc to hide this help

NCBI Assembly Reports

Preparations

# Install each command-line tool used on this page with cbp, one at a time.
for tool in nwr sqlite3 tva; do
    cbp install "${tool}"
done

Requires SQLite version 3.34 or above. The SQLite that ships with macOS does not work.

NCBI Taxonomy Statistics

# Download the NCBI Taxonomy statistics page and turn its cells into a
# six-column Markdown table.
#   -f  fail on an HTTP error instead of piping the error page downstream
#   -sS hide the progress meter but still report errors on stderr
#   -L  follow redirects
curl -fsSL "https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=statistics&?&unclassified=hide&uncultured=hide" |
    # Pull the text of the <td> cells matched by the selector, one per line.
    tva from html -q 'table[bgcolor="#CCCCFF"] table[bgcolor="#FFFFFF"] tr td text{}' |
    # Keep only lines with at least one non-blank character
    # (POSIX class instead of the GNU-only '\S').
    grep '[^[:space:]]' |
    # Fold every six lines into one row; paste joins with a tab by default.
    paste - - - - - - |
    tva to md --right 2-6
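| Ranks:        | higher taxa |   genus | species | lower taxa |     total |
|---------------|------------:|--------:|--------:|-----------:|----------:|
| Archaea       |           0 |     340 |   1,200 |      2,290 |     2,290 |
| Bacteria      |           0 |   5,782 |  33,615 |     90,218 |    90,218 |
| Eukaryota     |           0 | 104,261 | 631,437 |    804,447 |   804,447 |
| Fungi         |           0 |   8,095 |  74,507 |     88,460 |    88,460 |
| Metazoa       |           0 |  75,546 | 340,416 |    453,240 |   453,240 |
| Viridiplantae |           0 |  16,338 | 198,532 |    237,280 |   237,280 |
| Viruses       |          36 |   3,493 |  14,612 |    200,795 |   201,328 |
| All taxa      |          54 | 113,878 | 700,762 |  1,097,758 | 1,118,224 |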

NCBI ASSEMBLY

  • assembly_level
# For each assembly summary (refseq, then genbank), count rows per
# (assembly_level, genome_rep) pair and print the result as a Markdown table
# followed by a "Table: <name>" caption.
for db in refseq genbank; do
    summary=~/.nwr/assembly_summary_${db}.txt

    # Discard the first line of the summary file before handing it to tva.
    sed '1d' "${summary}" |
        tva stats -H -g assembly_level,genome_rep --count |
        tva keep-header -- sort |
        tva to md --fmt

    echo -e "\nTable: ${db}\n\n"
done
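| assembly_level  | genome_rep |   count |
|-----------------|------------|--------:|
| Chromosome      | Full       |   8,629 |
| Chromosome      | Partial    |     355 |
| Complete Genome | Full       |  76,533 |
| Complete Genome | Partial    |       7 |
| Contig          | Full       | 280,107 |
| Contig          | Partial    |      30 |
| Scaffold        | Full       | 158,032 |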

Table: refseq

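| assembly_level  | genome_rep |     count |
|-----------------|------------|----------:|
| Chromosome      | Full       |    44,020 |
| Chromosome      | Partial    |     1,196 |
| Complete Genome | Full       |   309,100 |
| Complete Genome | Partial    |       131 |
| Contig          | Full       | 2,549,556 |
| Contig          | Partial    |       933 |
| Scaffold        | Full       |   515,294 |
| Scaffold        | Partial    |       363 |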

Table: genbank

Example 1: count qualified assemblies of Eukaryote groups

# Groups to summarise, each encoded as 'GROUP_NAME::SCI_NAME'.
# GROUP_NAME is the display label; SCI_NAME is the taxon passed to `nwr member`.
ARRAY=(
    # Animals - Metazoa - kingdom
    'Flatworms::Platyhelminthes' # phylum
    'Roundworms::Nematoda'
    'Insects::Hexapoda' # subphylum
    'Reptiles::Testudines' # order
    'Reptiles::Lepidosauria'
    'Reptiles::Crocodylia'
    'Fishes::Chondrichthyes' # class
    'Fishes::Dipnoi'
    'Fishes::Actinopterygii'
    'Fishes::Hyperotreti'
    'Fishes::Hyperoartia'
    'Fishes::Coelacanthimorpha'
    'Mammals::Mammalia'
    'Birds::Aves'
    'Amphibians::Amphibia'
    # Fungi - kingdom
    'Ascomycetes::Ascomycota' # phylum
    'Basidiomycetes::Basidiomycota'
    # Plants - Viridiplantae
    'Green Plants::Viridiplantae'
    'Land Plants::Embryophyta'
    # Protists
    'Apicomplexans::Apicomplexa'
    'Kinetoplasts::Kinetoplastida'
)

# Start a fresh groups.tsv with the header row.
printf 'GROUP_NAME\tSCI_NAME\tComplete Genome\tChromosome\tScaffold\tContig\n' \
    > groups.tsv

for item in "${ARRAY[@]}"; do
    GROUP_NAME="${item%%::*}"
    SCI_NAME="${item##*::}"

    # Comma-separated first column of `nwr member ... -r genus`, with
    # "Candidatus "/"candidate " rows and the first (header) line removed.
    GENUS=$(
        nwr member "${SCI_NAME}" -r genus |
            grep -v -i "Candidatus " |
            grep -v -i "candidate " |
            sed '1d' |
            cut -f 1 |
            paste -s -d ',' -
    )

    # An empty list would produce `genus_id IN ` and a SQL syntax error,
    # so report it and skip the group instead.
    if [[ -z "${GENUS}" ]]; then
        >&2 echo "${SCI_NAME} has no genera"
        continue
    fi
    GENUS="(${GENUS})"

    # Names are passed as data, never as the printf format string.
    printf '%s\t%s\t' "${GROUP_NAME}" "${SCI_NAME}"

    # One COUNT(*) per assembly level; paste -s joins the four result lines
    # with tabs and ends the row with a newline (no GNU-only sed '\t' needed).
    for L in 'Complete Genome' 'Chromosome' 'Scaffold' 'Contig'; do
        echo "
            SELECT
                COUNT(*)
            FROM ar
            WHERE 1=1
                AND genus_id IN $GENUS
                AND assembly_level IN ('$L')
            " |
            sqlite3 -tabs ~/.nwr/ar_refseq.sqlite
    done |
        paste -s -d '\t' -
done \
    >> groups.tsv

# Render the collected rows as a Markdown table.
tva to md --num < groups.tsv

GROUP_NAMESCI_NAMEComplete GenomeChromosomeScaffoldContig
FlatwormsPlatyhelminthes0250
RoundwormsNematoda1430
InsectsHexapoda120810530
ReptilesTestudines01711
ReptilesLepidosauria02591
ReptilesCrocodylia0160
FishesChondrichthyes02610
FishesDipnoi0100
FishesActinopterygii1225399
FishesHyperotreti0100
FishesHyperoartia0400
FishesCoelacanthimorpha0100
MammalsMammalia4173897
BirdsAves1106545
AmphibiansAmphibia02931
AscomycetesAscomycota4749276162
BasidiomycetesBasidiomycota27184832
Green PlantsViridiplantae9155589
Land PlantsEmbryophyta7152538
ApicomplexansApicomplexa225393
KinetoplastsKinetoplastida11373

Table: refseq - Eukaryotes

GROUP_NAMESCI_NAMEComplete GenomeChromosomeScaffoldContig
FlatwormsPlatyhelminthes0478920
RoundwormsNematoda4157348218
InsectsHexapoda21351333892573
ReptilesTestudines1595010
ReptilesLepidosauria011728130
ReptilesCrocodylia05140
FishesChondrichthyes056606
FishesDipnoi0402
FishesActinopterygii3111112107320
FishesHyperotreti0430
FishesHyperoartia07144
FishesCoelacanthimorpha0130
MammalsMammalia2514712280973
BirdsAves34472191330
AmphibiansAmphibia09318612
AscomycetesAscomycota4681312108726713
BasidiomycetesBasidiomycota12718817461247
Green PlantsViridiplantae252420328951261
Land PlantsEmbryophyta220413226881024
ApicomplexansApicomplexa2013219989
KinetoplastsKinetoplastida1672119104

Table: genbank - Eukaryotes

Example 2: count qualified assemblies of Prokaryote groups

# Start a fresh groups.tsv with the header row.
printf 'GROUP_NAME\tComplete Genome\tChromosome\tScaffold\tContig\n' \
    > groups.tsv

for item in Bacteria Archaea; do
    # Sorted second column of `nwr member ... -r phylum`, one name per line,
    # with "Candidatus "/"candidate " rows and the first (header) line removed.
    PHYLUM=$(
        nwr member "${item}" -r phylum |
            grep -v -i "Candidatus " |
            grep -v -i "candidate " |
            sed '1d' |
            cut -f 2 |
            sort
    )

    # Section row: the domain name followed by four empty cells.
    printf '%s\t\t\t\t\n' "${item}"

    # Read phylum names line by line so a name containing spaces or glob
    # characters stays intact. File descriptor 3 keeps the list away from the
    # stdin of the commands run inside the loop.
    while IFS= read -r P <&3; do
        # The here-string yields a single empty line when PHYLUM is empty.
        [[ -n "${P}" ]] || continue

        # Comma-separated first column of `nwr member ... -r genus`,
        # filtered the same way as the phylum list.
        GENUS=$(
            nwr member "${P}" -r genus |
                grep -v -i "Candidatus " |
                grep -v -i "candidate " |
                sed '1d' |
                cut -f 1 |
                paste -s -d ',' -
        )

        # An empty list would produce `genus_id IN ` and a SQL syntax error.
        if [[ -z "${GENUS}" ]]; then
            >&2 echo "${P} has no genera"
            continue
        fi
        GENUS="(${GENUS})"

        # The name is passed as data, never as the printf format string.
        printf '%s\t' "${P}"

        # One COUNT(*) per assembly level; paste -s joins the four result
        # lines with tabs and ends the row with a newline.
        for L in 'Complete Genome' 'Chromosome' 'Scaffold' 'Contig'; do
            echo "
                SELECT
                    COUNT(*)
                FROM ar
                WHERE 1=1
                    AND genus_id IN $GENUS
                    AND assembly_level IN ('$L')
                " |
                sqlite3 -tabs ~/.nwr/ar_refseq.sqlite
        done |
            paste -s -d '\t' -
    done 3<<< "${PHYLUM}"
done \
    >> groups.tsv

# Render the collected rows as a Markdown table, right-aligning the counts.
tva to md --right 2-5 < groups.tsv

GROUP_NAMEComplete GenomeChromosomeScaffoldContig
Bacteria
Abditibacteriota1001
Acidobacteriota47113867
Actinomycetota60509762612420668
Aquificota2522667
Armatimonadota3448
Atribacterota3012
Bacillota1411516024296669650
Bacteroidota1997284792810745
Balneolota311539
Bdellovibrionota49104844
Caldisericota1092
Calditrichota1103
Campylobacterota148211625848148
Chlamydiota3039054193
Chlorobiota161936
Chloroflexota54163109
Chrysiogenota3050
Coprothermobacterota1012
Cyanobacteriota416448031331
Deferribacterota90922
Deinococcota1135142234
Dictyoglomota7061
Elusimicrobiota4001
Fibrobacterota202360
Fidelibacterota1000
Fusobacteriota2629211472
Gemmatimonadota101948
Ignavibacteriota30512
Kiritimatiellota2006
Lentisphaerota20123
Minisyncoccota1000
Mycoplasmatota953713821135
Myxococcota131937148
Nitrospinota10110
Nitrospirota2401924
Planctomycetota863061117
Pseudomonadota33304359771037157832
Rhodothermota1934199
Spirochaetota4672843731411
Synergistota12449110
Thermodesulfobacteriota18612279487
Thermodesulfobiota2002
Thermomicrobiota2039
Thermosulfidibacterota1000
Thermotogota61110599
Verrucomicrobiota1499237272
Vulcanimicrobiota1000
Zhurongbacterota1000
Archaea
Methanobacteriota523215471147
Microcaldota0000
Nanobdellota1000
Nitrososphaerota2131226
Promethearchaeota1000
Thermoplasmatota160975
Thermoproteota1336117127

Table: refseq - Prokaryotes

GROUP_NAMEComplete GenomeChromosomeScaffoldContig
Bacteria
Abditibacteriota11511
Acidobacteriota5613160612
Actinomycetota63718453327633974
Aquificota22282172
Armatimonadota413057
Atribacterota3057
Bacillota16847190687271465348
Bacteroidota21433141709630441
Balneolota1354396
Bdellovibrionota5310147223
Caldisericota10204
Calditrichota11742
Campylobacterota27991626074157198
Chlamydiota40879118225
Chlorobiota1713067
Chloroflexota571286375
Chrysiogenota3020
Coprothermobacterota101410
Cyanobacteriota4688114663874
Deferribacterota70520266
Deinococcota1185193282
Dictyoglomota70155
Elusimicrobiota40145
Fibrobacterota20109199
Fidelibacterota1000
Fusobacteriota29314258906
Gemmatimonadota8133167
Ignavibacteriota316245
Kiritimatiellota201348
Lentisphaerota201255
Minisyncoccota1001
Mycoplasmatota11322624471561
Myxococcota1371078351
Nitrospinota101367
Nitrospirota355307456
Planctomycetota10533172699
Pseudomonadota4202546391227191437144
Rhodothermota20352260
Spirochaetota5787136772696
Synergistota144127239
Thermodesulfobacteriota189116871767
Thermodesulfobiota2056
Thermomicrobiota20834
Thermosulfidibacterota1013
Thermotogota561232219
Verrucomicrobiota1641114322010
Vulcanimicrobiota1000
Zhurongbacterota1000
Archaea
Methanobacteriota5432512962504
Microcaldota0000
Nanobdellota2001
Nitrososphaerota4322200653
Promethearchaeota1006
Thermoplasmatota18046213
Thermoproteota1376476436

Table: genbank - Prokaryotes

Example 3: find accessions of a species

Staphylococcus capitis - 29388 - 头状葡萄球菌

# Look the species up by name, then list the members of its taxon ID.
nwr info "Staphylococcus capitis"

nwr member 29388

# Assemblies whose tax_id differs from the species ID (i.e. with a strain ID).
# Written with a header row; this creates/overwrites Scap.assembly.tsv.
# The quoted heredoc delimiter passes the SQL to sqlite3 verbatim.
sqlite3 -tabs ~/.nwr/ar_refseq.sqlite <<'SQL' > Scap.assembly.tsv
.headers ON
    SELECT
        organism_name,
        species,
        genus,
        ftp_path,
        assembly_level
    FROM ar
    WHERE 1=1
        AND tax_id != species_id    -- with strain ID
        AND species_id IN (29388)
    ;
SQL

# Assemblies recorded directly under the species ID (no strain ID), limited to
# Chromosome / Complete Genome level. The accession (dots replaced by
# underscores) is appended to the species to form organism_name.
# String literals use single quotes: SQL reads double-quoted text as an
# identifier, and recent sqlite3 CLI builds no longer fall back to treating it
# as a string.
sqlite3 -tabs ~/.nwr/ar_refseq.sqlite <<'SQL' >> Scap.assembly.tsv
    SELECT
        species || ' ' || REPLACE(assembly_accession, '.', '_') AS organism_name,
        species,
        genus,
        ftp_path,
        assembly_level
    FROM ar
    WHERE 1=1
        AND tax_id = species_id     -- no strain ID
        AND assembly_level IN ('Chromosome', 'Complete Genome')
        AND species_id IN (29388)
    ;
SQL

Example 4: find model organisms in a family

# List tax_id and organism_name of the 'reference genome' assemblies in the
# family Enterobacteriaceae, prefix the header line with '#', and render the
# result as a Markdown table.
sqlite3 -tabs ~/.nwr/ar_refseq.sqlite <<'SQL' |

.headers ON
    SELECT
        tax_id,
        organism_name
    FROM ar
    WHERE 1=1
        AND family IN ('Enterobacteriaceae')
        AND refseq_category IN ('reference genome')
    
SQL
    sed '1s/^/#/' |
    tva to md

| #tax_id | organism_name                                                    |
|---------|------------------------------------------------------------------|
| 511145  | Escherichia coli str. K-12 substr. MG1655                        |
| 198214  | Shigella flexneri 2a str. 301                                    |
| 99287   | Salmonella enterica subsp. enterica serovar Typhimurium str. LT2 |
| 386585  | Escherichia coli O157:H7 str. Sakai                              |
| 1125630 | Klebsiella pneumoniae subsp. pneumoniae HS11286                  |