{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# `natsel_zhang` -- a branch-site test\n", "\n", "This is the hypothesis test presented in [Zhang et al](https://www.ncbi.nlm.nih.gov/pubmed/16107592). This test evaluates the hypothesis that a set of sites have undergone positive natural selection on a pre-specified set of lineages.\n", "\n", "For this model class, there are groups of branches for which all positions are evolving neutrally but some proportion of those neutrally evolving sites change to adaptively evolving on so-called foreground edges. For the current example, we'll define the Chimpanzee and Human branches as foreground and everything else as background. The following table defines the parameter scopes.\n", "\n", "| Site Class | Proportion | Background Edges | Foreground Edges |\n", "|------------|---------------|---------------------------|---------------------------|\n", "| 0 | p0 | 0 < omega0 < 1 | 0 < omega0 < 1 |\n", "| 1 | p1 | omega1 = 1 | omega1 = 1 |\n", "| 2a | p2 | 0 < omega0 < 1 | 0 < omega2 > 1 |\n", "| 2b | p3 | omega1 = 1 | 0 < omega0 < 1 |\n", "\n", "**NOTE:** Our implementation is not as parametrically succinct as that of Zhang et al, we have 1 additional bin probability." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Statistics
LRdfpvalue
4.964730.1744
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
hypothesiskeylnLnfpDLCunique_Q
null'GNC-null'-6708.311924True
alt'GNC-alt'-6705.829627True
\n" ], "text/plain": [ "Statistics\n", "======================\n", " LR df pvalue\n", "----------------------\n", "4.9647 3 0.1744\n", "----------------------\n", "=================================================================\n", "hypothesis key lnL nfp DLC unique_Q\n", "-----------------------------------------------------------------\n", " null 'GNC-null' -6708.3119 24 True \n", " alt 'GNC-alt' -6705.8296 27 True \n", "-----------------------------------------------------------------" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from cogent3.app import io, evo\n", "\n", "loader = io.load_aligned(format=\"fasta\", moltype=\"dna\")\n", "aln = loader(\"../data/primate_brca1.fasta\")\n", "\n", "zhang_test = evo.natsel_zhang(\"GNC\",\n", " tree=\"../data/primate_brca1.tree\",\n", " optimise_motif_probs=False,\n", " tip1=\"Human\",\n", " tip2=\"Chimpanzee\")\n", "\n", "result = zhang_test(aln)\n", "result" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

GNC-alt

\n", "

log-likelihood = -6705.8296

\n", "

number of free parameters = 27

\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Global params
A>CA>GA>TC>AC>GC>TG>AG>CG>TT>A
0.85543.53430.97441.65862.19376.25858.01041.24180.79421.2667
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
T>C
2.9645
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Bin params
binbprobs
00.0532
10.2655
2a0.0403
2b0.6410
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Edge params
edgeparentlength
Galagoroot0.5419
HowlerMonroot0.1359
Rhesusedge.30.0648
Orangutanedge.20.0235
Gorillaedge.10.0075
Humanedge.00.0182
Chimpanzeeedge.00.0085
edge.0edge.10.0000
edge.1edge.20.0099
edge.2edge.30.0365
edge.3root0.0234
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Edge bin params
edgebinomega
Galago00.0000
Galago11.0000
Galago2a0.0000
Galago2b1.0000
HowlerMon00.0000
HowlerMon11.0000
HowlerMon2a0.0000
HowlerMon2b1.0000
Rhesus00.0000
Rhesus11.0000
Rhesus2a0.0000
Rhesus2b1.0000
Orangutan00.0000
Orangutan11.0000
Orangutan2a0.0000
Orangutan2b1.0000
Gorilla00.0000
Gorilla11.0000
Gorilla2a0.0000
Gorilla2b1.0000
Human00.0000
Human11.0000
Human2a20.0000
Human2b20.0000
Chimpanzee00.0000
Chimpanzee11.0000
Chimpanzee2a20.0000
Chimpanzee2b20.0000
edge.000.0000
edge.011.0000
edge.02a0.0000
edge.02b1.0000
edge.100.0000
edge.111.0000
edge.12a0.0000
edge.12b1.0000
edge.200.0000
edge.211.0000
edge.22a0.0000
edge.22b1.0000
edge.300.0000
edge.311.0000
edge.32a0.0000
edge.32b1.0000
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Motif params
AAAAACAAGAATACAACCACGACTAGAAGC
0.05560.02350.03440.05560.02280.00460.00080.02890.02310.0286
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
AGGAGTATAATCATGATTCAACACCAGCAT
0.01400.03810.01860.00700.01280.01920.01960.00520.02380.0221
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
CCACCCCCGCCTCGACGCCGGCGTCTACTC
0.01950.00620.00060.02630.00110.00090.00230.00320.01370.0078
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
CTGCTTGAAGACGAGGATGCAGCCGCGGCT
0.01250.01050.07550.01050.03030.03150.01580.00960.00140.0137
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
GGAGGCGGGGGTGTAGTCGTGGTTTACTAT
0.01610.00900.00670.01330.01480.00700.00690.02130.00230.0101
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
TCATCCTCGTCTTGCTGGTGTTTATTCTTG
0.02210.00820.00150.02510.00180.00400.02010.02120.00780.0108
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
TTT
0.0187
\n" ], "text/plain": [ "GNC-alt\n", "log-likelihood = -6705.8296\n", "number of free parameters = 27\n", "============================================================================\n", " A>C A>G A>T C>A C>G C>T G>A G>C\n", "----------------------------------------------------------------------------\n", "0.8554 3.5343 0.9744 1.6586 2.1937 6.2585 8.0104 1.2418\n", "----------------------------------------------------------------------------\n", "\n", "continued: \n", "==========================\n", " G>T T>A T>C\n", "--------------------------\n", "0.7942 1.2667 2.9645\n", "--------------------------\n", "\n", "=============\n", "bin bprobs\n", "-------------\n", " 0 0.0532\n", " 1 0.2655\n", " 2a 0.0403\n", " 2b 0.6410\n", "-------------\n", "==============================\n", " edge parent length\n", "------------------------------\n", " Galago root 0.5419\n", " HowlerMon root 0.1359\n", " Rhesus edge.3 0.0648\n", " Orangutan edge.2 0.0235\n", " Gorilla edge.1 0.0075\n", " Human edge.0 0.0182\n", "Chimpanzee edge.0 0.0085\n", " edge.0 edge.1 0.0000\n", " edge.1 edge.2 0.0099\n", " edge.2 edge.3 0.0365\n", " edge.3 root 0.0234\n", "------------------------------\n", "============================\n", " edge bin omega\n", "----------------------------\n", " Galago 0 0.0000\n", " Galago 1 1.0000\n", " Galago 2a 0.0000\n", " Galago 2b 1.0000\n", " HowlerMon 0 0.0000\n", " HowlerMon 1 1.0000\n", " HowlerMon 2a 0.0000\n", " HowlerMon 2b 1.0000\n", " Rhesus 0 0.0000\n", " Rhesus 1 1.0000\n", " Rhesus 2a 0.0000\n", " Rhesus 2b 1.0000\n", " Orangutan 0 0.0000\n", " Orangutan 1 1.0000\n", " Orangutan 2a 0.0000\n", " Orangutan 2b 1.0000\n", " Gorilla 0 0.0000\n", " Gorilla 1 1.0000\n", " Gorilla 2a 0.0000\n", " Gorilla 2b 1.0000\n", " Human 0 0.0000\n", " Human 1 1.0000\n", " Human 2a 20.0000\n", " Human 2b 20.0000\n", "Chimpanzee 0 0.0000\n", "Chimpanzee 1 1.0000\n", "Chimpanzee 2a 20.0000\n", "Chimpanzee 2b 20.0000\n", " edge.0 0 0.0000\n", " edge.0 1 1.0000\n", " edge.0 2a 0.0000\n", " edge.0 2b 1.0000\n", " edge.1 0 0.0000\n", " edge.1 1 1.0000\n", " edge.1 2a 0.0000\n", " edge.1 2b 1.0000\n", " edge.2 0 0.0000\n", " edge.2 1 1.0000\n", " edge.2 2a 0.0000\n", " edge.2 2b 1.0000\n", " edge.3 0 0.0000\n", " edge.3 1 1.0000\n", " edge.3 2a 0.0000\n", " edge.3 2b 1.0000\n", "----------------------------\n", "============================================================================\n", " AAA AAC AAG AAT ACA ACC ACG ACT\n", "----------------------------------------------------------------------------\n", "0.0556 0.0235 0.0344 0.0556 0.0228 0.0046 0.0008 0.0289\n", "----------------------------------------------------------------------------\n", "\n", "continued: \n", "============================================================================\n", " AGA AGC AGG AGT ATA ATC ATG ATT\n", "----------------------------------------------------------------------------\n", "0.0231 0.0286 0.0140 0.0381 0.0186 0.0070 0.0128 0.0192\n", "----------------------------------------------------------------------------\n", "\n", "continued: \n", "============================================================================\n", " CAA CAC CAG CAT CCA CCC CCG CCT\n", "----------------------------------------------------------------------------\n", "0.0196 0.0052 0.0238 0.0221 0.0195 0.0062 0.0006 0.0263\n", "----------------------------------------------------------------------------\n", "\n", "continued: \n", "============================================================================\n", " CGA CGC CGG CGT CTA CTC CTG CTT\n", "----------------------------------------------------------------------------\n", "0.0011 0.0009 0.0023 0.0032 0.0137 0.0078 0.0125 0.0105\n", "----------------------------------------------------------------------------\n", "\n", "continued: \n", "============================================================================\n", " GAA GAC GAG GAT GCA GCC GCG GCT\n", "----------------------------------------------------------------------------\n", "0.0755 0.0105 0.0303 0.0315 0.0158 0.0096 0.0014 0.0137\n", "----------------------------------------------------------------------------\n", "\n", "continued: \n", "============================================================================\n", " GGA GGC GGG GGT GTA GTC GTG GTT\n", "----------------------------------------------------------------------------\n", "0.0161 0.0090 0.0067 0.0133 0.0148 0.0070 0.0069 0.0213\n", "----------------------------------------------------------------------------\n", "\n", "continued: \n", "============================================================================\n", " TAC TAT TCA TCC TCG TCT TGC TGG\n", "----------------------------------------------------------------------------\n", "0.0023 0.0101 0.0221 0.0082 0.0015 0.0251 0.0018 0.0040\n", "----------------------------------------------------------------------------\n", "\n", "continued: \n", "==============================================\n", " TGT TTA TTC TTG TTT\n", "----------------------------------------------\n", "0.0201 0.0212 0.0078 0.0108 0.0187\n", "----------------------------------------------" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result.alt.lf" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting the posterior probabilities of site-class membership" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
012345678910
00.0760.0430.0000.0670.0590.0800.0430.0610.0520.0410.039
10.2550.2700.2930.2590.2630.2530.2700.2620.2660.2710.272
2a0.0570.0330.0000.0500.0440.0600.0330.0460.0400.0320.030
2b0.6130.6540.7070.6240.6340.6080.6540.6320.6430.6570.659
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
111213141516171819
00.0800.0480.0000.0800.2620.0410.0360.0590.062
10.2530.2680.2930.2530.1560.2710.2730.2630.261
2a0.0600.0370.0000.0590.2020.0320.0280.0440.047
2b0.6080.6480.7070.6080.3790.6570.6640.6340.630
\n" ], "text/plain": [ "==========================================================================================================================================================================================================\n", " 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19\n", "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", " 0 0.0759 0.0427 0.0000 0.0670 0.0586 0.0800 0.0430 0.0608 0.0519 0.0411 0.0392 0.0800 0.0480 0.0000 0.0797 0.2618 0.0411 0.0355 0.0586 0.0620\n", " 1 0.2546 0.2700 0.2929 0.2588 0.2628 0.2527 0.2699 0.2617 0.2658 0.2706 0.2716 0.2527 0.2676 0.2926 0.2528 0.1564 0.2706 0.2733 0.2628 0.2611\n", "2a 0.0568 0.0329 0.0000 0.0504 0.0444 0.0597 0.0331 0.0460 0.0396 0.0317 0.0303 0.0597 0.0367 0.0000 0.0595 0.2023 0.0317 0.0275 0.0444 0.0468\n", "2b 0.6127 0.6543 0.7071 0.6237 0.6343 0.6076 0.6540 0.6315 0.6428 0.6566 0.6589 0.6076 0.6477 0.7074 0.6080 0.3794 0.6566 0.6636 0.6343 0.6301\n", "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bprobs = result.alt.lf.get_bin_probs()\n", "bprobs[:, :20]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting all the statistics in tabular form" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "5x tabular_result('global params': Table, 'bin params': Table, 'edge params': Table, 'edge bin params': Table)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tab = evo.tabulate_stats()\n", "stats = tab(result.alt)\n", "stats" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
edge bin params
edgebinomega
Galago00.0000
Galago11.0000
Galago2a0.0000
Galago2b1.0000
HowlerMon00.0000
HowlerMon11.0000
HowlerMon2a0.0000
HowlerMon2b1.0000
Rhesus00.0000
Rhesus11.0000
\n", "

\n", "10 rows x 3 columns

" ], "text/plain": [ "edge bin params\n", "==========================\n", " edge bin omega\n", "--------------------------\n", " Galago 0 0.0000\n", " Galago 1 1.0000\n", " Galago 2a 0.0000\n", " Galago 2b 1.0000\n", "HowlerMon 0 0.0000\n", "HowlerMon 1 1.0000\n", "HowlerMon 2a 0.0000\n", "HowlerMon 2b 1.0000\n", " Rhesus 0 0.0000\n", " Rhesus 1 1.0000\n", "--------------------------\n", "\n", "10 rows x 3 columns" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stats[\"edge bin params\"][:10] # truncating the table" ] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:c3dev] *", "language": "python", "name": "conda-env-c3dev-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": {}, "version_major": 2, "version_minor": 0 } } }, "nbformat": 4, "nbformat_minor": 4 }