{"componentChunkName":"component---node-modules-gatsby-theme-try-ghost-src-templates-post-js","path":"/causal-machine-learning-part-5/","result":{"data":{"ghostPost":{"id":"Ghost__Post__63b58208bdc6867fe35526ff","title":"Causal Machine Learning - Part 5","slug":"causal-machine-learning-part-5","featured":false,"feature_image":"https://images.unsplash.com/photo-1648007547791-404a2abfdc82?crop=entropy&cs=tinysrgb&fit=max&fm=jpg&ixid=MnwxMTc3M3wwfDF8c2VhcmNofDY5Mnx8cm9ib3R8ZW58MHx8fHwxNjcyODM5Njc3&ixlib=rb-4.0.3&q=80&w=2000","excerpt":"This post is the fifth post of the series on Causal Machine Learning.  This blog post is based on the work of Judea Pearl. As always i will try to keep the things as simple as possible. So stay with me , Enjoy reading !","custom_excerpt":"This post is the fifth post of the series on Causal Machine Learning.  This blog post is based on the work of Judea Pearl. As always i will try to keep the things as simple as possible. So stay with me , Enjoy reading !","visibility":"public","created_at_pretty":"4 Jan 2023","published_at_pretty":"4 Jan 2023","updated_at_pretty":"20 Feb 2023","created_at":"2023-01-04T19:11:28.000+05:30","published_at":"2023-01-04T19:12:15.000+05:30","updated_at":"2023-02-20T09:33:55.000+05:30","meta_title":null,"meta_description":null,"og_description":null,"og_image":null,"og_title":null,"twitter_description":null,"twitter_image":null,"twitter_title":null,"authors":[{"slug":"amaljith","url":"http://localhost:2368/author/amaljith/","name":"Amaljith","bio":"Research Scholar @ IIT Kharagpur","cover_image":null,"profile_image":"http://localhost:2368/content/images/2022/09/Screenshot-from-2022-09-07-18-00-00.png","location":null,"website":null,"twitter":null,"facebook":null,"meta_title":null,"meta_description":null,"coverImageSharp":null,"profileImageSharp":null}],"primary_author":{"slug":"amaljith","url":"http://localhost:2368/author/amaljith/","name":"Amaljith","bio":"Research Scholar @ IIT 
Kharagpur","cover_image":null,"profile_image":"http://localhost:2368/content/images/2022/09/Screenshot-from-2022-09-07-18-00-00.png","location":null,"website":null,"twitter":null,"facebook":null,"meta_title":null,"meta_description":null,"coverImageSharp":null,"profileImageSharp":{"base":"Screenshot-from-2022-09-07-18-00-00.png","publicURL":"/static/28e31bfedd96b4afe90237d2c1f700c3/Screenshot-from-2022-09-07-18-00-00.png","imageMeta":{"width":316,"height":237},"childImageSharp":{"fluid":{"base64":"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAPCAYAAADkmO9VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAEJElEQVQ4yyXO21fTBQDA8d9fUh5LQAW1eEImbIPJbSB3kZtijjSBxNvAGSAqlJZ64GAHIQ0VjkKICCiCQ8cYsYFjYwhsYxeGjJsopmEnH76dYw+f94/w2jmKfaSX9wtTrC1a+bBk5d83dlbnzFhHehnq62TBZeLjGyfvFqf4a9nGstuAx/oM96Qai76NieFu5u0jLLrNCCsOI47nat56J1hbtLH60kTfg5tUV6hQFeSSFLGTgtwcBtUteGc0zLoGmXNpmXU8Yd6txTOlxvq8hzmr/hNhyTaC06hmya3nn6VxHKMPKS08RFF+HgWK/aiO5JP3TQ4xMgn3mqt599bC8pyBRY+OxRkdXkc/rrE+POZ+nMY+hFXHKPPWP5m3a/i4Yqax9kfCREEkyKPJSk3mQFYmOelpRISGEBMuwahv4/WC4dNuwa3l1Us906O9jGs7sWg7EHQdv6NurWVmvJv33iHKTuSyft06tm3xJygwEIlIxDb/zUhFIvw2+HDrxkX+XjUx5x7A6+pn3qXBZnyIoacFTftNhKarlTxprsMz3sPbWR2jmiYiJSICt27lqwB/tmza+L8Af5KkYsYNHXhcvbidahy2x0ya2jFqm7lxqYxrF04jvLQMYB16jKazHrOulRWHjiJFOus/+5yAjX5s8vNhs58vfj5foooO43LJMaovncI1/ZhpWxcTlk6mRjrpa7uOrqsJYcaiY25imKftv6F52IBx4D61Z46y6Yv1+PpsYLOvD36+G0gJDuRcrIS0nTvY8XUAl66osDq60A3e5sHtXzCoW3CY+hHeeKdZdL5gytDDgsOAe1zL9KiawowEsoK2URi+HWWMmKvZuzibKuNcqoTiWBEXShToLHfo7K6h+9413DYzS/MehJnJYZxjOowDHdjHnjE53M2kUU1/zVmu742lOlNOkyKRmtzd1KkUtObFcacgmUetVTT0/kTDvQpemPoxDWmoralG8NrNvJqZwmbS8Xp5FrfNhMtuxtNei75Kia3xAmtdVZgaf6axvprKvXH8kB5FXd1lFMcyuVp7Hnd9E0+bb5Cdvgfhw4oXPq5hGR5kzGzkUVcPVeWlPK8tx/BrCfcrjnI+bx+KPSmcPJxP2QklFUolRfmHyf/uEEXK49y9XMndaxdJjpQirCy4WLCPoe38A4Nez8XySjISU5CFhCIO3k5wUDASkZiUuHj2Z2ahyN5HvuIgRUeOcUZZTOlJJeUlKgbU7Txta0AYbLnFqewMrpSepvhIITm700nblYg8IpJIWRjyiAjiI6JJjJKTGBtHcnwCqYlJpCelkJOWwfe533Jw337Kik9g1N5HqC9RcfyAgvTUPcRHyYmWypCFiJGKQggNDkYsEiEJEhEmCiE8REyYWIJMKiVCEoY8fCcJUTHE
x8QgCw0l/8Be/gOTiTiUD46AHAAAAABJRU5ErkJggg==","aspectRatio":1.3333333333333333,"src":"/static/28e31bfedd96b4afe90237d2c1f700c3/6ccb0/Screenshot-from-2022-09-07-18-00-00.png","srcSet":"/static/28e31bfedd96b4afe90237d2c1f700c3/7d89d/Screenshot-from-2022-09-07-18-00-00.png 28w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/f4091/Screenshot-from-2022-09-07-18-00-00.png 55w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/6ccb0/Screenshot-from-2022-09-07-18-00-00.png 110w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/30481/Screenshot-from-2022-09-07-18-00-00.png 165w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/33bd6/Screenshot-from-2022-09-07-18-00-00.png 220w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/d398b/Screenshot-from-2022-09-07-18-00-00.png 316w","srcWebp":"/static/28e31bfedd96b4afe90237d2c1f700c3/8678c/Screenshot-from-2022-09-07-18-00-00.webp","srcSetWebp":"/static/28e31bfedd96b4afe90237d2c1f700c3/59cda/Screenshot-from-2022-09-07-18-00-00.webp 28w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/7da75/Screenshot-from-2022-09-07-18-00-00.webp 55w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/8678c/Screenshot-from-2022-09-07-18-00-00.webp 110w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/f282e/Screenshot-from-2022-09-07-18-00-00.webp 165w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/a7b21/Screenshot-from-2022-09-07-18-00-00.webp 220w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/fb2b8/Screenshot-from-2022-09-07-18-00-00.webp 316w","sizes":"(max-width: 110px) 100vw, 110px"}}}},"primary_tag":{"slug":"machine-learning","url":"http://localhost:2368/tag/machine-learning/","name":"Machine Learning","visibility":"public","feature_image":null,"description":null,"meta_title":null,"meta_description":null,"featureImageSharp":null},"tags":[{"slug":"machine-learning","url":"http://localhost:2368/tag/machine-learning/","name":"Machine 
Learning","visibility":"public","feature_image":null,"description":null,"meta_title":null,"meta_description":null,"featureImageSharp":null},{"slug":"artificial-intelligence","url":"http://localhost:2368/tag/artificial-intelligence/","name":"Artificial Intelligence","visibility":"public","feature_image":null,"description":null,"meta_title":null,"meta_description":null,"featureImageSharp":null},{"slug":"causal-machine-learning","url":"http://localhost:2368/tag/causal-machine-learning/","name":"Causal Machine Learning","visibility":"public","feature_image":null,"description":null,"meta_title":null,"meta_description":null,"featureImageSharp":null}],"plaintext":"This post is the fifth post of the series on Causal Machine Learning. As stated before, the starting point for all causal inference is a causal model. Usually, however, we don’t have a good causal model in hand. This is where Causal Discovery can be helpful. In this post Causal Discovery will be discussed in detail.  As always i will try to keep the things as simple as possible. So stay with me , Enjoy reading !\n\n\nWhat is Causal Discovery?\n\n\nCausal inference focuses on estimating the causal effect of a specific intervention or exposure on an outcome.\n\n\n\nCausal discovery focuses on identifying the underlying causal relationships between variables in a system.\n\n\n\nCausal inference, which aims to answer questions involving cause and effect. As stated before, the starting point for all causal inference is a causal model. Usually, however, we don’t have a good causal model in hand. This is where Causal Discovery can be helpful.\n\nCausal discovery aims to infer causal structure from data. In other words, given a dataset, derive a causal model that describes it.\n\nFinding causal relationships is one of the fundamental tasks in science. A widely used approach is randomized experiments. 
For example, to examine whether a recently developed medicine is useful for cancer treatment, researchers recruit subjects and randomly divide subjects into two groups. One is the control group, where the subjects are given placebo, and the other is the treatment group, where the subjects are given the newly developed drug. The reason of randomization is to remove possible effects from confounders. For example, age can be one of the possible confounders which affects both taking the drug or not and the treatment effect. Thus, in practical experiments, we should keep the distribution of ages in the two groups almost the same.\n\n\nHow Does It Work ?\n\n\nHowever, in many cases, randomized experiments are very expensive and hard to implement, and sometimes it may even involve ethical issues. In recent decades, inferring causal relations from purely observational data, known as the task of causal discovery, has drawn much attention in machine learning, philosophy, statistics, and computer science.\n\nCausal discovery is an example of an inverse problem. This is like predicting the shape of an ice cube based on the puddle it left on the kitchen counter. Clearly, this is a hard problem, since any number of shapes could generate the same puddle. Connecting this to causality, the puddle of water is like statistical associations embedded in data, and the ice cube is the like underlying causal model.\n\n\nCausal Discovery Assumptions & Properties\n\n\nThe usual approach to solving inverse problems is to make assumptions about what you are trying to uncover. This narrows down the possible solutions and hopefully makes the problem solvable. 
There are four common assumptions made across causal discovery algorithms.\n\n👉 Acyclicity — Causal structure can be represented by DAG (G)\n\n\n👀 Markov Property — All nodes are independent of their non-descendants when conditioned on their parents\n\n\n🙃 Faithfulness — All conditional independences in true underlying distribution p are represented in G\n\n\n👍 Sufficiency — Any pair of nodes in G has no common external cause\n\n\nAlthough these assumptions help narrow down the number of possible models, they do not fully solve the problem. This is where a few tricks/tests for causal discovery are helpful. There is no single method for causal discovery that dominates all others. Although most methods use the assumptions above (perhaps even more), the details of different algorithms can vary tremendously. A taxonomy of algorithms based on the following tricks is given in the figure below.\n\n\n\n\nConditional Independence Testing\n\n\nOne of these earliest causal discovery algorithms is the PC algorithm named after its authors Peter Spirtes and Clark Glymour. This algorithm (and others like it) use the idea that two statistically independent variables are not causally linked. The PC algorithm is illustrative of this first trick. An outline of the algorithm is given in the figure below.\n\n\n\n\nThe first step is to form a fully connected, undirected graph using every variable in the dataset. Next, edges are deleted if the corresponding variables are independent. Then, connected edges undergo conditional independence testing e.g. independence test of the bottom and far right node conditioned on the middle node in the figure above (step 2).\n\nIf conditioning on a variable kills the dependence, that variable is added to the Separation set for those two variables. Depending on the size of the graph, conditional independence testing will continue (i.e. condition on more variables) until there are no more candidates for testing.\n\nNext, colliders (i.e. 
X → Y ← Z) are oriented based on the Separation set of node pairs. Finally, the remaining edges are directed based on 2 constraints, 1) no new v-structures, and 2) no directed cycles can be formed.\n\nGreedy Search of Graph Space\n\n\nA greedy search is a way to navigate a space such that you always move in a direction that seems most beneficial based on the local surroundings. Although greedy searches cannot guarantee an optimal solution, for most problems the space of possible DAGs is so big that finding a true optimal solution is intractable. The Greedy Equivalence Search (GES) algorithm uses this trick. GES starts with an empty graph and iteratively adds directed edges such that the improvement in a model fitness measure (i.e. score) is maximized. An example score is the Bayesian Information Criterion (BIC)\n\nExploiting Asymmetries\n\n\n\n\n\nA fundamental property of causality is asymmetry. A could cause B, but B may not cause A. There is a large space of algorithms that leverage this idea to select between causal model candidates.\n\n\n\nFunctional asymmetry assumes models that better fit a relationship are better candidates. For example, given two variables X and Y, the nonlinear additive noise model (NANM) performs a nonlinear regression between X and Y, e.g. y = f(x) + n, where n = noise/residual, in both directions. The model (i.e. causation) is then accepted if the potential cause (e.g. x) is independent of the noise term (e.g. n).\n\n\nConclusion\n\n\nThere is no way I could fit a comprehensive review of causal discovery in a short blog post. Despite being young, causal discovery is a promising field that may help bridge the gap between machine and human knowledge.\n\n\n\n\nReferences\n\n\nCausal Deep Learning\n","html":"<p>This post is the fifth post of the series on <a href=\"http://localhost:2368/tag/causal-machine-learning\">Causal Machine Learning</a>. As stated before, the starting point for all causal inference is a causal model. 
Usually, however, we don’t have a good causal model in hand. This is where <strong>Causal<strong> </strong>D<strong>iscovery</strong> </strong>can be helpful. In this post Causal Discovery will be discussed in detail.  As always i will try to keep the things as simple as possible. So stay with me , Enjoy reading !</p><!--kg-card-begin: markdown--><h3 id=\"what-is-causal-discovery\">What is Causal Discovery?</h3>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><pre><code>Causal inference focuses on estimating the causal effect of a specific intervention or exposure on an outcome.\n</code></pre>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><pre><code>Causal discovery focuses on identifying the underlying causal relationships between variables in a system.\n</code></pre>\n<!--kg-card-end: markdown--><p><strong>C<strong>ausal inference</strong></strong>, which aims to <em>answer questions involving cause and effect. </em>As stated before, the starting point for all causal inference is a causal model. Usually, however, we don’t have a good causal model in hand. This is where Causal Discovery can be helpful.</p><p><strong><strong>Causal discovery</strong></strong> aims to <strong><strong>infer causal structure from data</strong></strong>. In other words, given a dataset, <em><em>derive</em></em> a causal model that describes it.</p><blockquote>Finding causal relationships is one of the fundamental tasks in science. A widely used approach is <em><strong>randomized experiments</strong></em>. For example, to examine whether a recently developed medicine is useful for cancer treatment, researchers recruit subjects and randomly divide subjects into two groups. One is the control group, where the subjects are given placebo, and the other is the treatment group, where the subjects are given the newly developed drug. The reason of randomization is to remove possible effects from confounders. 
For example, age can be one of the possible confounders which affects both taking the drug or not and the treatment effect. Thus, in practical experiments, we should keep the distribution of ages in the two groups almost the same.</blockquote><!--kg-card-begin: markdown--><h3 id=\"how-does-it-work\">How Does It Work ?</h3>\n<!--kg-card-end: markdown--><p>However, in many cases, randomized experiments are very expensive and hard to implement, and sometimes it may even involve ethical issues. In recent decades, <strong><em>inferring causal relations from purely observational data, known as the task of causal discovery</em></strong>, has drawn much attention in machine learning, philosophy, statistics, and computer science.</p><p>Causal discovery is an example of an <strong><strong>inverse problem</strong></strong>. This is like predicting the shape of an ice cube based on the puddle it left on the kitchen counter. Clearly, this is a hard problem, since any number of shapes could generate the same puddle. Connecting this to causality, the <em>puddle of water is like statistical associations embedded in data</em>, and the <em>ice cube is the like underlying causal model</em>.</p><!--kg-card-begin: markdown--><h3 id=\"causal-discovery-assumptions-properties\">Causal Discovery Assumptions &amp; Properties</h3>\n<!--kg-card-end: markdown--><p>The usual approach to solving inverse problems is to make assumptions about what you are trying to uncover. This narrows down the possible solutions and hopefully makes the problem solvable. There are four common assumptions made across causal discovery algorithms. 
</p><!--kg-card-begin: markdown--><p>👉 <strong>Acyclicity</strong> <span style=\"color:orange\">— Causal structure can be represented by DAG (G)</span></p>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><p>👀  <strong>Markov Property</strong> <span style=\"color:blue\"> — All nodes are independent of their non-descendants when conditioned on their parents</span></p>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><p>🙃  <strong>Faithfulness</strong> <span style=\"color:green\">— All conditional independences in true underlying distribution p are represented in G</span></p>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><p>👍 <strong>Sufficiency</strong> <span style=\"color:red\">— Any pair of nodes in G has no common external cause</span></p>\n<!--kg-card-end: markdown--><p>Although these assumptions help narrow down the number of possible models, they do not fully solve the problem. This is where a few tricks/tests for causal discovery are helpful. There is no single method for causal discovery that dominates all others. Although most methods use the assumptions above (perhaps even more), the details of different algorithms can vary tremendously. A taxonomy of algorithms based on the following tricks is given in the figure below.</p><!--kg-card-begin: markdown--><p><img src=\"https://user-images.githubusercontent.com/33357428/219991483-bd875dbc-d04d-4bca-a1db-1b553162f199.png\" alt=\"causaldiscoveryalgo\" loading=\"lazy\"></p>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><h4 id=\"conditional-independence-testing\">Conditional Independence Testing</h4>\n<!--kg-card-end: markdown--><p>One of these earliest causal discovery algorithms is the <strong><strong>PC algorithm</strong></strong> named after its authors Peter Spirtes and Clark Glymour. This algorithm (and others like it) use the idea that <strong><strong>two</strong> <strong>statistically independent variables are not causally linked</strong></strong>. 
The PC algorithm is illustrative of this first trick. An outline of the algorithm is given in the figure below.</p><!--kg-card-begin: markdown--><p><img src=\"https://user-images.githubusercontent.com/33357428/219992079-8d7af535-e4bd-48e0-93a6-a993f65b84cb.png\" alt=\"pcalgo\" loading=\"lazy\"></p>\n<!--kg-card-end: markdown--><p>The first step is to form a fully connected, undirected graph using every variable in the dataset. Next, edges are deleted if the corresponding variables are independent. Then, connected edges undergo conditional independence testing e.g. independence test of the bottom and far right node conditioned on the middle node in the figure above (step 2).</p><p>If conditioning on a variable kills the dependence, that variable is added to the Separation set for those two variables. Depending on the size of the graph, conditional independence testing will continue (i.e. condition on more variables) until there are no more candidates for testing.</p><p>Next, colliders (i.e. X → Y ← Z) are oriented based on the Separation set of node pairs. Finally, the remaining edges are directed based on 2 constraints, 1) no new v-structures, and 2) no directed cycles can be formed.</p><!--kg-card-begin: markdown--><h4 id=\"greedy-search-of-graph-space\">Greedy Search of Graph Space</h4>\n<!--kg-card-end: markdown--><p>A greedy search is a way to navigate a space such that you always move in a direction that <em><em>seems</em></em> most beneficial based on the local surroundings. Although greedy searches cannot guarantee an optimal solution, for most problems the space of possible DAGs is so big that finding a <em><em>true</em></em> optimal solution is intractable. The <strong><strong>Greedy Equivalence Search (GES)</strong></strong> algorithm uses this trick. GES starts with an empty graph and iteratively adds directed edges such that the improvement in a model fitness measure (i.e. score) is maximized. 
An example score is the Bayesian Information Criterion (BIC)</p><!--kg-card-begin: markdown--><h4 id=\"exploiting-asymmetries\">Exploiting Asymmetries</h4>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><blockquote>\n<p>A fundamental property of causality is asymmetry. A could cause B, but B may not cause A. There is a large space of algorithms that leverage this idea to select between causal model candidates.</p>\n</blockquote>\n<!--kg-card-end: markdown--><p><strong><strong>Functional asymmetry</strong></strong> assumes <strong><strong>models that better fit</strong></strong> a relationship <strong><strong>are better candidates</strong></strong>. For example, given two variables X and Y, the <strong><strong>nonlinear additive noise model (NANM)</strong></strong> performs a nonlinear regression between X and Y, e.g. y = f(x) + n, where n = noise/residual, in both directions. The model (i.e. causation) is then accepted if the potential cause (e.g. x) is independent of the noise term (e.g. n).</p><!--kg-card-begin: markdown--><h3 id=\"conclusion\">Conclusion</h3>\n<!--kg-card-end: markdown--><p>There is no way I could fit a comprehensive review of causal discovery in a short blog post. 
Despite being young, causal discovery is a promising field that may help bridge the gap between machine and human <em><em>knowledge</em></em>.</p><p></p><!--kg-card-begin: markdown--><h3 id=\"references\">References</h3>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><p><a href=\"https://www.vanderschaar-lab.com/causal-deep-learning/\">Causal Deep Learning</a></p>\n<!--kg-card-end: markdown-->","url":"http://localhost:2368/causal-machine-learning-part-5/","canonical_url":null,"uuid":"c7bf04a7-9abc-4857-9afe-c06790a1c5e3","codeinjection_foot":null,"codeinjection_head":null,"codeinjection_styles":null,"comment_id":"63b58208bdc6867fe35526ff","reading_time":4,"send_email_when_published":null,"email_subject":null,"childHtmlRehype":{"html":"<p>This post is the fifth post of the series on <a href=\"/tag/causal-machine-learning\">Causal Machine Learning</a>. As stated before, the starting point for all causal inference is a causal model. Usually, however, we don’t have a good causal model in hand. This is where <strong>Causal<strong> </strong>D<strong>iscovery</strong> </strong>can be helpful. In this post Causal Discovery will be discussed in detail.  As always i will try to keep the things as simple as possible. 
So stay with me , Enjoy reading !</p><!--kg-card-begin: markdown--><h3 id=\"what-is-causal-discovery\">What is Causal Discovery?</h3>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><div class=\"kg-card kg-code-card gatsby-highlight\" data-language=\"text\"><pre class=\"language-text\"><code class=\"language-text\">Causal inference focuses on estimating the causal effect of a specific intervention or exposure on an outcome.\n</code></pre></div>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><div class=\"kg-card kg-code-card gatsby-highlight\" data-language=\"text\"><pre class=\"language-text\"><code class=\"language-text\">Causal discovery focuses on identifying the underlying causal relationships between variables in a system.\n</code></pre></div>\n<!--kg-card-end: markdown--><p><strong>C<strong>ausal inference</strong></strong>, which aims to <em>answer questions involving cause and effect. </em>As stated before, the starting point for all causal inference is a causal model. Usually, however, we don’t have a good causal model in hand. This is where Causal Discovery can be helpful.</p><p><strong><strong>Causal discovery</strong></strong> aims to <strong><strong>infer causal structure from data</strong></strong>. In other words, given a dataset, <em><em>derive</em></em> a causal model that describes it.</p><blockquote>Finding causal relationships is one of the fundamental tasks in science. A widely used approach is <em><strong>randomized experiments</strong></em>. For example, to examine whether a recently developed medicine is useful for cancer treatment, researchers recruit subjects and randomly divide subjects into two groups. One is the control group, where the subjects are given placebo, and the other is the treatment group, where the subjects are given the newly developed drug. The reason of randomization is to remove possible effects from confounders. 
For example, age can be one of the possible confounders which affects both taking the drug or not and the treatment effect. Thus, in practical experiments, we should keep the distribution of ages in the two groups almost the same.</blockquote><!--kg-card-begin: markdown--><h3 id=\"how-does-it-work\">How Does It Work ?</h3>\n<!--kg-card-end: markdown--><p>However, in many cases, randomized experiments are very expensive and hard to implement, and sometimes it may even involve ethical issues. In recent decades, <strong><em>inferring causal relations from purely observational data, known as the task of causal discovery</em></strong>, has drawn much attention in machine learning, philosophy, statistics, and computer science.</p><p>Causal discovery is an example of an <strong><strong>inverse problem</strong></strong>. This is like predicting the shape of an ice cube based on the puddle it left on the kitchen counter. Clearly, this is a hard problem, since any number of shapes could generate the same puddle. Connecting this to causality, the <em>puddle of water is like statistical associations embedded in data</em>, and the <em>ice cube is the like underlying causal model</em>.</p><!--kg-card-begin: markdown--><h3 id=\"causal-discovery-assumptions-properties\">Causal Discovery Assumptions &#x26; Properties</h3>\n<!--kg-card-end: markdown--><p>The usual approach to solving inverse problems is to make assumptions about what you are trying to uncover. This narrows down the possible solutions and hopefully makes the problem solvable. There are four common assumptions made across causal discovery algorithms. 
</p><!--kg-card-begin: markdown--><p>👉 <strong>Acyclicity</strong> <span style=\"color:orange\">— Causal structure can be represented by DAG (G)</span></p>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><p>👀  <strong>Markov Property</strong> <span style=\"color:blue\"> — All nodes are independent of their non-descendants when conditioned on their parents</span></p>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><p>🙃  <strong>Faithfulness</strong> <span style=\"color:green\">— All conditional independences in true underlying distribution p are represented in G</span></p>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><p>👍 <strong>Sufficiency</strong> <span style=\"color:red\">— Any pair of nodes in G has no common external cause</span></p>\n<!--kg-card-end: markdown--><p>Although these assumptions help narrow down the number of possible models, they do not fully solve the problem. This is where a few tricks/tests for causal discovery are helpful. There is no single method for causal discovery that dominates all others. Although most methods use the assumptions above (perhaps even more), the details of different algorithms can vary tremendously. A taxonomy of algorithms based on the following tricks is given in the figure below.</p><!--kg-card-begin: markdown--><p><img src=\"https://user-images.githubusercontent.com/33357428/219991483-bd875dbc-d04d-4bca-a1db-1b553162f199.png\" alt=\"causaldiscoveryalgo\" loading=\"lazy\"></p>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><h4 id=\"conditional-independence-testing\">Conditional Independence Testing</h4>\n<!--kg-card-end: markdown--><p>One of these earliest causal discovery algorithms is the <strong><strong>PC algorithm</strong></strong> named after its authors Peter Spirtes and Clark Glymour. This algorithm (and others like it) use the idea that <strong><strong>two</strong> <strong>statistically independent variables are not causally linked</strong></strong>. 
The PC algorithm is illustrative of this first trick. An outline of the algorithm is given in the figure below.</p><!--kg-card-begin: markdown--><p><img src=\"https://user-images.githubusercontent.com/33357428/219992079-8d7af535-e4bd-48e0-93a6-a993f65b84cb.png\" alt=\"pcalgo\" loading=\"lazy\"></p>\n<!--kg-card-end: markdown--><p>The first step is to form a fully connected, undirected graph using every variable in the dataset. Next, edges are deleted if the corresponding variables are independent. Then, connected edges undergo conditional independence testing e.g. independence test of the bottom and far right node conditioned on the middle node in the figure above (step 2).</p><p>If conditioning on a variable kills the dependence, that variable is added to the Separation set for those two variables. Depending on the size of the graph, conditional independence testing will continue (i.e. condition on more variables) until there are no more candidates for testing.</p><p>Next, colliders (i.e. X → Y ← Z) are oriented based on the Separation set of node pairs. Finally, the remaining edges are directed based on 2 constraints, 1) no new v-structures, and 2) no directed cycles can be formed.</p><!--kg-card-begin: markdown--><h4 id=\"greedy-search-of-graph-space\">Greedy Search of Graph Space</h4>\n<!--kg-card-end: markdown--><p>A greedy search is a way to navigate a space such that you always move in a direction that <em><em>seems</em></em> most beneficial based on the local surroundings. Although greedy searches cannot guarantee an optimal solution, for most problems the space of possible DAGs is so big that finding a <em><em>true</em></em> optimal solution is intractable. The <strong><strong>Greedy Equivalence Search (GES)</strong></strong> algorithm uses this trick. GES starts with an empty graph and iteratively adds directed edges such that the improvement in a model fitness measure (i.e. score) is maximized. 
An example score is the Bayesian Information Criterion (BIC)</p><!--kg-card-begin: markdown--><h4 id=\"exploiting-asymmetries\">Exploiting Asymmetries</h4>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><blockquote>\n<p>A fundamental property of causality is asymmetry. A could cause B, but B may not cause A. There is a large space of algorithms that leverage this idea to select between causal model candidates.</p>\n</blockquote>\n<!--kg-card-end: markdown--><p><strong><strong>Functional asymmetry</strong></strong> assumes <strong><strong>models that better fit</strong></strong> a relationship <strong><strong>are better candidates</strong></strong>. For example, given two variables X and Y, the <strong><strong>nonlinear additive noise model (NANM)</strong></strong> performs a nonlinear regression between X and Y, e.g. y = f(x) + n, where n = noise/residual, in both directions. The model (i.e. causation) is then accepted if the potential cause (e.g. x) is independent of the noise term (e.g. n).</p><!--kg-card-begin: markdown--><h3 id=\"conclusion\">Conclusion</h3>\n<!--kg-card-end: markdown--><p>There is no way I could fit a comprehensive review of causal discovery in a short blog post. Despite being young, causal discovery is a promising field that may help bridge the gap between machine and human <em><em>knowledge</em></em>.</p><p></p><!--kg-card-begin: markdown--><h3 id=\"references\">References</h3>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><p><a href=\"https://www.vanderschaar-lab.com/causal-deep-learning/\">Causal Deep Learning</a></p>\n<!--kg-card-end: markdown-->","htmlAst":{"type":"root","children":[{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"This post is the fifth post of the series on "},{"type":"element","tagName":"a","properties":{"href":"/tag/causal-machine-learning"},"children":[{"type":"text","value":"Causal Machine Learning"}]},{"type":"text","value":". 
As stated before, the starting point for all causal inference is a causal model. Usually, however, we don’t have a good causal model in hand. This is where "},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"Causal"},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":" "}]},{"type":"text","value":"D"},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"iscovery"}]},{"type":"text","value":" "}]},{"type":"text","value":"can be helpful. In this post Causal Discovery will be discussed in detail.  As always I will try to keep things as simple as possible. So stay with me, enjoy reading!"}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h3","properties":{"id":"what-is-causal-discovery"},"children":[{"type":"text","value":"What is Causal Discovery?"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"div","properties":{"className":["kg-card","kg-code-card","gatsby-highlight"],"dataLanguage":"text"},"children":[{"type":"element","tagName":"pre","properties":{"className":["language-text"]},"children":[{"type":"element","tagName":"code","properties":{"className":["language-text"]},"children":[{"type":"text","value":"Causal inference focuses on estimating the causal effect of a specific intervention or exposure on an outcome.\n"}]}]}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"comment","value":"kg-card-begin: 
markdown"},{"type":"element","tagName":"div","properties":{"className":["kg-card","kg-code-card","gatsby-highlight"],"dataLanguage":"text"},"children":[{"type":"element","tagName":"pre","properties":{"className":["language-text"]},"children":[{"type":"element","tagName":"code","properties":{"className":["language-text"]},"children":[{"type":"text","value":"Causal discovery focuses on identifying the underlying causal relationships between variables in a system.\n"}]}]}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"C"},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"ausal inference"}]}]},{"type":"text","value":", which aims to "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"answer questions involving cause and effect. "}]},{"type":"text","value":"As stated before, the starting point for all causal inference is a causal model. Usually, however, we don’t have a good causal model in hand. This is where Causal Discovery can be helpful."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"element","tagName":"strong","properties":{},"children":[{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"Causal discovery"}]}]},{"type":"text","value":" aims to "},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"infer causal structure from data"}]}]},{"type":"text","value":". 
In other words, given a dataset, "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"derive"}]}]},{"type":"text","value":" a causal model that describes it."}]},{"type":"element","tagName":"blockquote","properties":{},"children":[{"type":"text","value":"Finding causal relationships is one of the fundamental tasks in science. A widely used approach is "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"randomized experiments"}]}]},{"type":"text","value":". For example, to examine whether a recently developed medicine is useful for cancer treatment, researchers recruit subjects and randomly divide subjects into two groups. One is the control group, where the subjects are given a placebo, and the other is the treatment group, where the subjects are given the newly developed drug. The reason for randomization is to remove possible effects from confounders. For example, age can be one of the possible confounders which affects both taking the drug or not and the treatment effect. Thus, in practical experiments, we should keep the distribution of ages in the two groups almost the same."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h3","properties":{"id":"how-does-it-work"},"children":[{"type":"text","value":"How Does It Work ?"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"However, in many cases, randomized experiments are very expensive and hard to implement, and sometimes it may even involve ethical issues. 
In recent decades, "},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"inferring causal relations from purely observational data, known as the task of causal discovery"}]}]},{"type":"text","value":", has drawn much attention in machine learning, philosophy, statistics, and computer science."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Causal discovery is an example of an "},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"inverse problem"}]}]},{"type":"text","value":". This is like predicting the shape of an ice cube based on the puddle it left on the kitchen counter. Clearly, this is a hard problem, since any number of shapes could generate the same puddle. Connecting this to causality, the "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"puddle of water is like statistical associations embedded in data"}]},{"type":"text","value":", and the "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"ice cube is like the underlying causal model"}]},{"type":"text","value":"."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h3","properties":{"id":"causal-discovery-assumptions-properties"},"children":[{"type":"text","value":"Causal Discovery Assumptions & Properties"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"The usual approach to solving inverse problems is to make assumptions about what you are trying to uncover. This narrows down the possible solutions and hopefully makes the problem solvable. There are four common assumptions made across causal discovery algorithms. 
"}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"👉 "},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"Acyclicity"}]},{"type":"text","value":" "},{"type":"element","tagName":"span","properties":{"style":"color:orange"},"children":[{"type":"text","value":"— Causal structure can be represented by DAG (G)"}]}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"👀  "},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"Markov Property"}]},{"type":"text","value":" "},{"type":"element","tagName":"span","properties":{"style":"color:blue"},"children":[{"type":"text","value":" — All nodes are independent of their non-descendants when conditioned on their parents"}]}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"🙃  "},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"Faithfulness"}]},{"type":"text","value":" "},{"type":"element","tagName":"span","properties":{"style":"color:green"},"children":[{"type":"text","value":"— All conditional independences in true underlying distribution p are represented in G"}]}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"👍 "},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"Sufficiency"}]},{"type":"text","value":" 
"},{"type":"element","tagName":"span","properties":{"style":"color:red"},"children":[{"type":"text","value":"— Any pair of nodes in G has no common external cause"}]}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Although these assumptions help narrow down the number of possible models, they do not fully solve the problem. This is where a few tricks/tests for causal discovery are helpful. There is no single method for causal discovery that dominates all others. Although most methods use the assumptions above (perhaps even more), the details of different algorithms can vary tremendously. A taxonomy of algorithms based on the following tricks is given in the figure below."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/219991483-bd875dbc-d04d-4bca-a1db-1b553162f199.png","alt":"causaldiscoveryalgo","loading":"lazy"},"children":[]}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h4","properties":{"id":"conditional-independence-testing"},"children":[{"type":"text","value":"Conditional Independence Testing"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"One of these earliest causal discovery algorithms is the "},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"PC algorithm"}]}]},{"type":"text","value":" named after its authors Peter Spirtes and Clark Glymour. 
This algorithm (and others like it) use the idea that "},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"two"}]},{"type":"text","value":" "},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"statistically independent variables are not causally linked"}]}]},{"type":"text","value":". The PC algorithm is illustrative of this first trick. An outline of the algorithm is given in the figure below."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/219992079-8d7af535-e4bd-48e0-93a6-a993f65b84cb.png","alt":"pcalgo","loading":"lazy"},"children":[]}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"The first step is to form a fully connected, undirected graph using every variable in the dataset. Next, edges are deleted if the corresponding variables are independent. Then, connected edges undergo conditional independence testing e.g. independence test of the bottom and far right node conditioned on the middle node in the figure above (step 2)."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"If conditioning on a variable kills the dependence, that variable is added to the Separation set for those two variables. Depending on the size of the graph, conditional independence testing will continue (i.e. condition on more variables) until there are no more candidates for testing."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Next, colliders (i.e. X → Y ← Z) are oriented based on the Separation set of node pairs. 
Finally, the remaining edges are directed based on 2 constraints, 1) no new v-structures, and 2) no directed cycles can be formed."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h4","properties":{"id":"greedy-search-of-graph-space"},"children":[{"type":"text","value":"Greedy Search of Graph Space"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"A greedy search is a way to navigate a space such that you always move in a direction that "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"seems"}]}]},{"type":"text","value":" most beneficial based on the local surroundings. Although greedy searches cannot guarantee an optimal solution, for most problems the space of possible DAGs is so big that finding a "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"true"}]}]},{"type":"text","value":" optimal solution is intractable. The "},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"Greedy Equivalence Search (GES)"}]}]},{"type":"text","value":" algorithm uses this trick. GES starts with an empty graph and iteratively adds directed edges such that the improvement in a model fitness measure (i.e. score) is maximized. 
An example score is the Bayesian Information Criterion (BIC)"}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h4","properties":{"id":"exploiting-asymmetries"},"children":[{"type":"text","value":"Exploiting Asymmetries"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"blockquote","properties":{},"children":[{"type":"text","value":"\n"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"A fundamental property of causality is asymmetry. A could cause B, but B may not cause A. There is a large space of algorithms that leverage this idea to select between causal model candidates."}]},{"type":"text","value":"\n"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"element","tagName":"strong","properties":{},"children":[{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"Functional asymmetry"}]}]},{"type":"text","value":" assumes "},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"models that better fit"}]}]},{"type":"text","value":" a relationship "},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"are better candidates"}]}]},{"type":"text","value":". For example, given two variables X and Y, the "},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"nonlinear additive noise model (NANM)"}]}]},{"type":"text","value":" performs a nonlinear regression between X and Y, e.g. y = f(x) + n, where n = noise/residual, in both directions. The model (i.e. 
causation) is then accepted if the potential cause (e.g. x) is independent of the noise term (e.g. n)."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h3","properties":{"id":"conclusion"},"children":[{"type":"text","value":"Conclusion"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"There is no way I could fit a comprehensive review of causal discovery in a short blog post. Despite being young, causal discovery is a promising field that may help bridge the gap between machine and human "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"knowledge"}]}]},{"type":"text","value":"."}]},{"type":"element","tagName":"p","properties":{},"children":[]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h3","properties":{"id":"references"},"children":[{"type":"text","value":"References"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"element","tagName":"a","properties":{"href":"https://www.vanderschaar-lab.com/causal-deep-learning/"},"children":[{"type":"text","value":"Causal Deep Learning"}]}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"}],"data":{"quirksMode":false}},"tableOfContents":[{"id":"what-is-causal-discovery","heading":"What is Causal Discovery?"},{"id":"how-does-it-work","heading":"How Does It Work ?"},{"id":"causal-discovery-assumptions-properties","heading":"Causal Discovery Assumptions & Properties","items":[{"id":"conditional-independence-testing","heading":"Conditional Independence Testing"},{"id":"greedy-search-of-graph-space","heading":"Greedy Search of Graph 
Space"},{"id":"exploiting-asymmetries","heading":"Exploiting Asymmetries"}]},{"id":"conclusion","heading":"Conclusion"},{"id":"references","heading":"References"}]},"featureImageSharp":{"base":"photo-1648007547791-404a2abfdc82.jpg","publicURL":"/static/595735875a5d9c06db59e957ae992251/photo-1648007547791-404a2abfdc82.jpg","imageMeta":{"width":2000,"height":1335},"childImageSharp":{"fluid":{"base64":"data:image/jpeg;base64,/9j/2wBDABALDA4MChAODQ4SERATGCgaGBYWGDEjJR0oOjM9PDkzODdASFxOQERXRTc4UG1RV19iZ2hnPk1xeXBkeFxlZ2P/2wBDARESEhgVGC8aGi9jQjhCY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2P/wgARCAANABQDASIAAhEBAxEB/8QAFwAAAwEAAAAAAAAAAAAAAAAAAAMEAv/EABUBAQEAAAAAAAAAAAAAAAAAAAID/9oADAMBAAIQAxAAAAFmYlypMOEP/8QAGhABAAIDAQAAAAAAAAAAAAAAAQACAxESIf/aAAgBAQABBQJKQzPGT2/TWK6dT//EABQRAQAAAAAAAAAAAAAAAAAAABD/2gAIAQMBAT8BP//EABQRAQAAAAAAAAAAAAAAAAAAABD/2gAIAQIBAT8BP//EABsQAAIDAAMAAAAAAAAAAAAAAAERAAIQIjFB/9oACAEBAAY/AmaOcqh+YEc6n//EABoQAQACAwEAAAAAAAAAAAAAAAEAESExQVH/2gAIAQEAAT8hp0EYGpZnMRwhETUMoeyltxcsdZ//2gAMAwEAAgADAAAAEFs//8QAFhEBAQEAAAAAAAAAAAAAAAAAAQAh/9oACAEDAQE/EBDZNv/EABYRAQEBAAAAAAAAAAAAAAAAAAERAP/aAAgBAgEBPxBFJhhN/8QAGhABAAMBAQEAAAAAAAAAAAAAAQARIVExQf/aAAgBAQABPxBLAjuaHjXyUnNA0ibrEVDFXBTQRYrrEhat907HdCn/2Q==","aspectRatio":1.4942528735632183,"src":"/static/595735875a5d9c06db59e957ae992251/d5c54/photo-1648007547791-404a2abfdc82.jpg","srcSet":"/static/595735875a5d9c06db59e957ae992251/65d8c/photo-1648007547791-404a2abfdc82.jpg 260w,\n/static/595735875a5d9c06db59e957ae992251/c5f21/photo-1648007547791-404a2abfdc82.jpg 520w,\n/static/595735875a5d9c06db59e957ae992251/d5c54/photo-1648007547791-404a2abfdc82.jpg 1040w,\n/static/595735875a5d9c06db59e957ae992251/81a53/photo-1648007547791-404a2abfdc82.jpg 1560w,\n/static/595735875a5d9c06db59e957ae992251/4e5f3/photo-1648007547791-404a2abfdc82.jpg 
2000w","srcWebp":"/static/595735875a5d9c06db59e957ae992251/e4875/photo-1648007547791-404a2abfdc82.webp","srcSetWebp":"/static/595735875a5d9c06db59e957ae992251/dc8f3/photo-1648007547791-404a2abfdc82.webp 260w,\n/static/595735875a5d9c06db59e957ae992251/2db4b/photo-1648007547791-404a2abfdc82.webp 520w,\n/static/595735875a5d9c06db59e957ae992251/e4875/photo-1648007547791-404a2abfdc82.webp 1040w,\n/static/595735875a5d9c06db59e957ae992251/f5845/photo-1648007547791-404a2abfdc82.webp 1560w,\n/static/595735875a5d9c06db59e957ae992251/49d6b/photo-1648007547791-404a2abfdc82.webp 2000w","sizes":"(max-width: 1040px) 100vw, 1040px"}}}},"prev":{"id":"Ghost__Post__63eeea8172a3c427182edcc0","title":"Explaining Bayesian Neural Networks","slug":"explaining-bayesian-neural-networks","featured":false,"feature_image":"https://images.unsplash.com/photo-1646583288948-24548aedffd8?crop=entropy&cs=tinysrgb&fit=max&fm=jpg&ixid=MnwxMTc3M3wwfDF8c2VhcmNofDc2fHxhaXxlbnwwfHx8fDE2NzY2MDIwNTY&ixlib=rb-4.0.3&q=80&w=2000","excerpt":"Explainable AI refers to models that can easily be interpreted by humans. Bayesian inferencing is a natural choice to obtain explanations of trained deep learning models.","custom_excerpt":"Explainable AI refers to models that can easily be interpreted by humans. 
Bayesian inferencing is a natural choice to obtain explanations of trained deep learning models.","visibility":"public","created_at_pretty":"17 Feb 2023","published_at_pretty":"17 Feb 2023","updated_at_pretty":"17 Feb 2023","created_at":"2023-02-17T08:16:25.000+05:30","published_at":"2023-02-17T08:22:35.000+05:30","updated_at":"2023-02-17T08:23:09.000+05:30","meta_title":null,"meta_description":null,"og_description":null,"og_image":null,"og_title":null,"twitter_description":null,"twitter_image":null,"twitter_title":null,"authors":[{"slug":"amaljith","url":"http://localhost:2368/author/amaljith/","name":"Amaljith","bio":"Research Scholar @ IIT Kharagpur","cover_image":null,"profile_image":"http://localhost:2368/content/images/2022/09/Screenshot-from-2022-09-07-18-00-00.png","location":null,"website":null,"twitter":null,"facebook":null,"meta_title":null,"meta_description":null,"coverImageSharp":null,"profileImageSharp":null}],"primary_author":{"slug":"amaljith","url":"http://localhost:2368/author/amaljith/","name":"Amaljith","bio":"Research Scholar @ IIT 
Kharagpur","cover_image":null,"profile_image":"http://localhost:2368/content/images/2022/09/Screenshot-from-2022-09-07-18-00-00.png","location":null,"website":null,"twitter":null,"facebook":null,"meta_title":null,"meta_description":null,"coverImageSharp":null,"profileImageSharp":{"base":"Screenshot-from-2022-09-07-18-00-00.png","publicURL":"/static/28e31bfedd96b4afe90237d2c1f700c3/Screenshot-from-2022-09-07-18-00-00.png","imageMeta":{"width":316,"height":237},"childImageSharp":{"fluid":{"base64":"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAPCAYAAADkmO9VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAEJElEQVQ4yyXO21fTBQDA8d9fUh5LQAW1eEImbIPJbSB3kZtijjSBxNvAGSAqlJZ64GAHIQ0VjkKICCiCQ8cYsYFjYwhsYxeGjJsopmEnH76dYw+f94/w2jmKfaSX9wtTrC1a+bBk5d83dlbnzFhHehnq62TBZeLjGyfvFqf4a9nGstuAx/oM96Qai76NieFu5u0jLLrNCCsOI47nat56J1hbtLH60kTfg5tUV6hQFeSSFLGTgtwcBtUteGc0zLoGmXNpmXU8Yd6txTOlxvq8hzmr/hNhyTaC06hmya3nn6VxHKMPKS08RFF+HgWK/aiO5JP3TQ4xMgn3mqt599bC8pyBRY+OxRkdXkc/rrE+POZ+nMY+hFXHKPPWP5m3a/i4Yqax9kfCREEkyKPJSk3mQFYmOelpRISGEBMuwahv4/WC4dNuwa3l1Us906O9jGs7sWg7EHQdv6NurWVmvJv33iHKTuSyft06tm3xJygwEIlIxDb/zUhFIvw2+HDrxkX+XjUx5x7A6+pn3qXBZnyIoacFTftNhKarlTxprsMz3sPbWR2jmiYiJSICt27lqwB/tmza+L8Af5KkYsYNHXhcvbidahy2x0ya2jFqm7lxqYxrF04jvLQMYB16jKazHrOulRWHjiJFOus/+5yAjX5s8vNhs58vfj5foooO43LJMaovncI1/ZhpWxcTlk6mRjrpa7uOrqsJYcaiY25imKftv6F52IBx4D61Z46y6Yv1+PpsYLOvD36+G0gJDuRcrIS0nTvY8XUAl66osDq60A3e5sHtXzCoW3CY+hHeeKdZdL5gytDDgsOAe1zL9KiawowEsoK2URi+HWWMmKvZuzibKuNcqoTiWBEXShToLHfo7K6h+9413DYzS/MehJnJYZxjOowDHdjHnjE53M2kUU1/zVmu742lOlNOkyKRmtzd1KkUtObFcacgmUetVTT0/kTDvQpemPoxDWmoralG8NrNvJqZwmbS8Xp5FrfNhMtuxtNei75Kia3xAmtdVZgaf6axvprKvXH8kB5FXd1lFMcyuVp7Hnd9E0+bb5Cdvgfhw4oXPq5hGR5kzGzkUVcPVeWlPK8tx/BrCfcrjnI+bx+KPSmcPJxP2QklFUolRfmHyf/uEEXK49y9XMndaxdJjpQirCy4WLCPoe38A4Nez8XySjISU5CFhCIO3k5wUDASkZiUuHj2Z2ahyN5HvuIgRUeOcUZZTOlJJeUlKgbU7Txta0AYbLnFqewMrpSepvhIITm700nblYg8IpJIWRjyiAjiI6JJjJKTGBtHcnwCqYlJpCelkJOWwfe533Jw337Kik9g1N5HqC9RcfyAgvTUPcRHyYmWypCFiJGKQggNDkYsEiEJEhEmCiE8REyYWIJMKiVCEoY8fCcJUTHE
x8QgCw0l/8Be/gOTiTiUD46AHAAAAABJRU5ErkJggg==","aspectRatio":1.3333333333333333,"src":"/static/28e31bfedd96b4afe90237d2c1f700c3/6ccb0/Screenshot-from-2022-09-07-18-00-00.png","srcSet":"/static/28e31bfedd96b4afe90237d2c1f700c3/7d89d/Screenshot-from-2022-09-07-18-00-00.png 28w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/f4091/Screenshot-from-2022-09-07-18-00-00.png 55w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/6ccb0/Screenshot-from-2022-09-07-18-00-00.png 110w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/30481/Screenshot-from-2022-09-07-18-00-00.png 165w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/33bd6/Screenshot-from-2022-09-07-18-00-00.png 220w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/d398b/Screenshot-from-2022-09-07-18-00-00.png 316w","srcWebp":"/static/28e31bfedd96b4afe90237d2c1f700c3/8678c/Screenshot-from-2022-09-07-18-00-00.webp","srcSetWebp":"/static/28e31bfedd96b4afe90237d2c1f700c3/59cda/Screenshot-from-2022-09-07-18-00-00.webp 28w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/7da75/Screenshot-from-2022-09-07-18-00-00.webp 55w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/8678c/Screenshot-from-2022-09-07-18-00-00.webp 110w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/f282e/Screenshot-from-2022-09-07-18-00-00.webp 165w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/a7b21/Screenshot-from-2022-09-07-18-00-00.webp 220w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/fb2b8/Screenshot-from-2022-09-07-18-00-00.webp 316w","sizes":"(max-width: 110px) 100vw, 110px"}}}},"primary_tag":{"slug":"explainable-ai","url":"http://localhost:2368/tag/explainable-ai/","name":"Explainable AI","visibility":"public","feature_image":null,"description":null,"meta_title":null,"meta_description":null,"featureImageSharp":null},"tags":[{"slug":"explainable-ai","url":"http://localhost:2368/tag/explainable-ai/","name":"Explainable 
AI","visibility":"public","feature_image":null,"description":null,"meta_title":null,"meta_description":null,"featureImageSharp":null},{"slug":"artificial-intelligence","url":"http://localhost:2368/tag/artificial-intelligence/","name":"Artificial Intelligence","visibility":"public","feature_image":null,"description":null,"meta_title":null,"meta_description":null,"featureImageSharp":null},{"slug":"machine-learning","url":"http://localhost:2368/tag/machine-learning/","name":"Machine Learning","visibility":"public","feature_image":null,"description":null,"meta_title":null,"meta_description":null,"featureImageSharp":null}],"plaintext":"Explainable AI refers to models that can easily be interpreted by humans. Bayesian inferencing is a natural choice to obtain explanations of trained deep learning models.","html":"<p>Explainable AI refers to models that can easily be interpreted by humans. Bayesian inferencing is a natural choice to obtain explanations of trained deep learning models.</p>","url":"http://localhost:2368/explaining-bayesian-neural-networks/","canonical_url":null,"uuid":"6bef7972-5e59-4ea0-8ccd-99292bb992ab","codeinjection_foot":null,"codeinjection_head":null,"codeinjection_styles":null,"comment_id":"63eeea8172a3c427182edcc0","reading_time":0,"send_email_when_published":null,"email_subject":null,"childHtmlRehype":{"html":"<p>Explainable AI refers to models that can easily be interpreted by humans. Bayesian inferencing is a natural choice to obtain explanations of trained deep learning models.</p>","htmlAst":{"type":"root","children":[{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Explainable AI refers to models that can easily be interpreted by humans. 
Bayesian inferencing is a natural choice to obtain explanations of trained deep learning models."}]}],"data":{"quirksMode":false}},"tableOfContents":[]},"featureImageSharp":{"base":"photo-1646583288948-24548aedffd8.jpg","publicURL":"/static/e6860d2b93d53230bfd7e05394764440/photo-1646583288948-24548aedffd8.jpg","imageMeta":{"width":2000,"height":1333},"childImageSharp":{"fluid":{"base64":"data:image/jpeg;base64,/9j/2wBDABALDA4MChAODQ4SERATGCgaGBYWGDEjJR0oOjM9PDkzODdASFxOQERXRTc4UG1RV19iZ2hnPk1xeXBkeFxlZ2P/2wBDARESEhgVGC8aGi9jQjhCY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2P/wgARCAANABQDASIAAhEBAxEB/8QAGAAAAwEBAAAAAAAAAAAAAAAAAAIDAQT/xAAVAQEBAAAAAAAAAAAAAAAAAAAAAf/aAAwDAQACEAMQAAABTnWKSMD/xAAYEAEBAQEBAAAAAAAAAAAAAAABAAIDEP/aAAgBAQABBQJbrlyS2lfP/8QAFBEBAAAAAAAAAAAAAAAAAAAAEP/aAAgBAwEBPwE//8QAFBEBAAAAAAAAAAAAAAAAAAAAEP/aAAgBAgEBPwE//8QAGBAAAgMAAAAAAAAAAAAAAAAAAAERIEH/2gAIAQEABj8CJe1//8QAGRAAAgMBAAAAAAAAAAAAAAAAAAEQETEh/9oACAEBAAE/IeYkoD0ZRqOP/9oADAMBAAIAAwAAABDPP//EABQRAQAAAAAAAAAAAAAAAAAAABD/2gAIAQMBAT8QP//EABQRAQAAAAAAAAAAAAAAAAAAABD/2gAIAQIBAT8QP//EABwQAQEBAAEFAAAAAAAAAAAAAAERABAhMUFRYf/aAAgBAQABPxAaswSgie+peDzmEip9blb33//Z","aspectRatio":1.4957264957264957,"src":"/static/e6860d2b93d53230bfd7e05394764440/ea4ab/photo-1646583288948-24548aedffd8.jpg","srcSet":"/static/e6860d2b93d53230bfd7e05394764440/477ba/photo-1646583288948-24548aedffd8.jpg 175w,\n/static/e6860d2b93d53230bfd7e05394764440/06776/photo-1646583288948-24548aedffd8.jpg 350w,\n/static/e6860d2b93d53230bfd7e05394764440/ea4ab/photo-1646583288948-24548aedffd8.jpg 700w,\n/static/e6860d2b93d53230bfd7e05394764440/3055e/photo-1646583288948-24548aedffd8.jpg 1050w,\n/static/e6860d2b93d53230bfd7e05394764440/eff08/photo-1646583288948-24548aedffd8.jpg 1400w,\n/static/e6860d2b93d53230bfd7e05394764440/4e5f3/photo-1646583288948-24548aedffd8.jpg 
2000w","srcWebp":"/static/e6860d2b93d53230bfd7e05394764440/89afa/photo-1646583288948-24548aedffd8.webp","srcSetWebp":"/static/e6860d2b93d53230bfd7e05394764440/9fca7/photo-1646583288948-24548aedffd8.webp 175w,\n/static/e6860d2b93d53230bfd7e05394764440/37a4e/photo-1646583288948-24548aedffd8.webp 350w,\n/static/e6860d2b93d53230bfd7e05394764440/89afa/photo-1646583288948-24548aedffd8.webp 700w,\n/static/e6860d2b93d53230bfd7e05394764440/78e7a/photo-1646583288948-24548aedffd8.webp 1050w,\n/static/e6860d2b93d53230bfd7e05394764440/03d34/photo-1646583288948-24548aedffd8.webp 1400w,\n/static/e6860d2b93d53230bfd7e05394764440/49d6b/photo-1646583288948-24548aedffd8.webp 2000w","sizes":"(max-width: 700px) 100vw, 700px"}}}},"next":{"id":"Ghost__Post__63b57b53bdc6867fe35526d6","title":"Causal Machine Learning - Part 4","slug":"causal-machine-learning-part-4","featured":false,"feature_image":"https://images.unsplash.com/photo-1535378620166-273708d44e4c?crop=entropy&cs=tinysrgb&fit=max&fm=jpg&ixid=MnwxMTc3M3wwfDF8c2VhcmNofDI0fHxyb2JvdHxlbnwwfHx8fDE2NzI4Mzc3Nzg&ixlib=rb-4.0.3&q=80&w=2000","excerpt":"This post is the fourth post of the series on Causal Machine Learning. This blog post is based on the work of Judea Pearl. As always i will try to keep the things as simple as possible. So stay with me , Enjoy reading !","custom_excerpt":"This post is the fourth post of the series on Causal Machine Learning. This blog post is based on the work of Judea Pearl. As always i will try to keep the things as simple as possible. 
So stay with me , Enjoy reading !","visibility":"public","created_at_pretty":"4 Jan 2023","published_at_pretty":"4 Jan 2023","updated_at_pretty":"20 Feb 2023","created_at":"2023-01-04T18:42:51.000+05:30","published_at":"2023-01-04T18:44:43.000+05:30","updated_at":"2023-02-20T06:14:06.000+05:30","meta_title":null,"meta_description":null,"og_description":null,"og_image":null,"og_title":null,"twitter_description":null,"twitter_image":null,"twitter_title":null,"authors":[{"slug":"amaljith","url":"http://localhost:2368/author/amaljith/","name":"Amaljith","bio":"Research Scholar @ IIT Kharagpur","cover_image":null,"profile_image":"http://localhost:2368/content/images/2022/09/Screenshot-from-2022-09-07-18-00-00.png","location":null,"website":null,"twitter":null,"facebook":null,"meta_title":null,"meta_description":null,"coverImageSharp":null,"profileImageSharp":null}],"primary_author":{"slug":"amaljith","url":"http://localhost:2368/author/amaljith/","name":"Amaljith","bio":"Research Scholar @ IIT 
Kharagpur","cover_image":null,"profile_image":"http://localhost:2368/content/images/2022/09/Screenshot-from-2022-09-07-18-00-00.png","location":null,"website":null,"twitter":null,"facebook":null,"meta_title":null,"meta_description":null,"coverImageSharp":null,"profileImageSharp":{"base":"Screenshot-from-2022-09-07-18-00-00.png","publicURL":"/static/28e31bfedd96b4afe90237d2c1f700c3/Screenshot-from-2022-09-07-18-00-00.png","imageMeta":{"width":316,"height":237},"childImageSharp":{"fluid":{"base64":"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAPCAYAAADkmO9VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAEJElEQVQ4yyXO21fTBQDA8d9fUh5LQAW1eEImbIPJbSB3kZtijjSBxNvAGSAqlJZ64GAHIQ0VjkKICCiCQ8cYsYFjYwhsYxeGjJsopmEnH76dYw+f94/w2jmKfaSX9wtTrC1a+bBk5d83dlbnzFhHehnq62TBZeLjGyfvFqf4a9nGstuAx/oM96Qai76NieFu5u0jLLrNCCsOI47nat56J1hbtLH60kTfg5tUV6hQFeSSFLGTgtwcBtUteGc0zLoGmXNpmXU8Yd6txTOlxvq8hzmr/hNhyTaC06hmya3nn6VxHKMPKS08RFF+HgWK/aiO5JP3TQ4xMgn3mqt599bC8pyBRY+OxRkdXkc/rrE+POZ+nMY+hFXHKPPWP5m3a/i4Yqax9kfCREEkyKPJSk3mQFYmOelpRISGEBMuwahv4/WC4dNuwa3l1Us906O9jGs7sWg7EHQdv6NurWVmvJv33iHKTuSyft06tm3xJygwEIlIxDb/zUhFIvw2+HDrxkX+XjUx5x7A6+pn3qXBZnyIoacFTftNhKarlTxprsMz3sPbWR2jmiYiJSICt27lqwB/tmza+L8Af5KkYsYNHXhcvbidahy2x0ya2jFqm7lxqYxrF04jvLQMYB16jKazHrOulRWHjiJFOus/+5yAjX5s8vNhs58vfj5foooO43LJMaovncI1/ZhpWxcTlk6mRjrpa7uOrqsJYcaiY25imKftv6F52IBx4D61Z46y6Yv1+PpsYLOvD36+G0gJDuRcrIS0nTvY8XUAl66osDq60A3e5sHtXzCoW3CY+hHeeKdZdL5gytDDgsOAe1zL9KiawowEsoK2URi+HWWMmKvZuzibKuNcqoTiWBEXShToLHfo7K6h+9413DYzS/MehJnJYZxjOowDHdjHnjE53M2kUU1/zVmu742lOlNOkyKRmtzd1KkUtObFcacgmUetVTT0/kTDvQpemPoxDWmoralG8NrNvJqZwmbS8Xp5FrfNhMtuxtNei75Kia3xAmtdVZgaf6axvprKvXH8kB5FXd1lFMcyuVp7Hnd9E0+bb5Cdvgfhw4oXPq5hGR5kzGzkUVcPVeWlPK8tx/BrCfcrjnI+bx+KPSmcPJxP2QklFUolRfmHyf/uEEXK49y9XMndaxdJjpQirCy4WLCPoe38A4Nez8XySjISU5CFhCIO3k5wUDASkZiUuHj2Z2ahyN5HvuIgRUeOcUZZTOlJJeUlKgbU7Txta0AYbLnFqewMrpSepvhIITm700nblYg8IpJIWRjyiAjiI6JJjJKTGBtHcnwCqYlJpCelkJOWwfe533Jw337Kik9g1N5HqC9RcfyAgvTUPcRHyYmWypCFiJGKQggNDkYsEiEJEhEmCiE8REyYWIJMKiVCEoY8fCcJUTHE
x8QgCw0l/8Be/gOTiTiUD46AHAAAAABJRU5ErkJggg==","aspectRatio":1.3333333333333333,"src":"/static/28e31bfedd96b4afe90237d2c1f700c3/6ccb0/Screenshot-from-2022-09-07-18-00-00.png","srcSet":"/static/28e31bfedd96b4afe90237d2c1f700c3/7d89d/Screenshot-from-2022-09-07-18-00-00.png 28w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/f4091/Screenshot-from-2022-09-07-18-00-00.png 55w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/6ccb0/Screenshot-from-2022-09-07-18-00-00.png 110w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/30481/Screenshot-from-2022-09-07-18-00-00.png 165w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/33bd6/Screenshot-from-2022-09-07-18-00-00.png 220w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/d398b/Screenshot-from-2022-09-07-18-00-00.png 316w","srcWebp":"/static/28e31bfedd96b4afe90237d2c1f700c3/8678c/Screenshot-from-2022-09-07-18-00-00.webp","srcSetWebp":"/static/28e31bfedd96b4afe90237d2c1f700c3/59cda/Screenshot-from-2022-09-07-18-00-00.webp 28w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/7da75/Screenshot-from-2022-09-07-18-00-00.webp 55w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/8678c/Screenshot-from-2022-09-07-18-00-00.webp 110w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/f282e/Screenshot-from-2022-09-07-18-00-00.webp 165w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/a7b21/Screenshot-from-2022-09-07-18-00-00.webp 220w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/fb2b8/Screenshot-from-2022-09-07-18-00-00.webp 316w","sizes":"(max-width: 110px) 100vw, 110px"}}}},"primary_tag":{"slug":"machine-learning","url":"http://localhost:2368/tag/machine-learning/","name":"Machine Learning","visibility":"public","feature_image":null,"description":null,"meta_title":null,"meta_description":null,"featureImageSharp":null},"tags":[{"slug":"machine-learning","url":"http://localhost:2368/tag/machine-learning/","name":"Machine 
Learning","visibility":"public","feature_image":null,"description":null,"meta_title":null,"meta_description":null,"featureImageSharp":null},{"slug":"causal-machine-learning","url":"http://localhost:2368/tag/causal-machine-learning/","name":"Causal Machine Learning","visibility":"public","feature_image":null,"description":null,"meta_title":null,"meta_description":null,"featureImageSharp":null},{"slug":"artificial-intelligence","url":"http://localhost:2368/tag/artificial-intelligence/","name":"Artificial Intelligence","visibility":"public","feature_image":null,"description":null,"meta_title":null,"meta_description":null,"featureImageSharp":null}],"plaintext":"This post is the fourth post of the series on Causal Machine Learning. This blog post is based on the work of Judea Pearl. We have so far discussed about Causal AI , How to represent causality using SCMs , How naive statistics can fail (Spurious Correlations , Simpson Paradox & Asymmetry In Causal Inference) , Pearl's Causal Hierarchy and gave flavors of Causal Inferences & Causal Discovery etc. As always i will try to keep the things as simple as possible. So stay with me , Enjoy reading !\n\n\nHow are Causal AI models different from Bayesian networks?\n\n\nAt first glance there, there could be some ambiguities you feel while trying to explore Bayesian Networks and Causal Networks. This is completely normal. Lets try to uncover the differences before going ahead!.\n\nBayesian networks and Causal AI models appear similar. But Causal AI models capture underlying causal relationships; Bayesian networks just describe patterns of correlations.\n\nWhat Are Bayesian Networks ?\n\n\nYou remember the example we discussed earlier about the data describing people’s sleeping habits. 
We found that there’s a strong correlation between falling asleep with shoes on and waking up with a headache.\n\n\nA Bayesian network representing this is given below.\n\n\nThe BN tells us that both sleeping with shoes on and waking up with a migraine are correlated with drinking the night before, since there is a path between the variables. It also says that conditional on us knowing that someone was drinking the night before, knowing that they slept with their shoes on tells us absolutely nothing extra about whether they have a headache the next morning (this is called “conditional independence”). We can read off the conditional independence relationship by noticing that drinking alcohol “blocks” the pathway from shoe- sleeping to headache. BNs help to draw conclusions when more data becomes available (via “Bayes’ theorem”)\n\nWhat Are Causal Networks ?\n\n\nBNs sound useful! What’s the catch? The core problem is that Bayesian networks are blind to causality. This key, missing ingredient makes BNs very limited when it comes to more sophisticated reasoning and decision-making challenges.\n\nCausal AI is a new category of machine intelligence. Causal AI builds models that are able to capture cause- effect relationships while also retaining the benefits of BNs.\n\n\n\nMany BNs are all statistically compatible with the data, but only one BN corresponds to the genuine causal relationships in the system. As a result, it’s always left ambiguous whether your BN is a good causal model or not. And the overwhelming chances are that your BN is not a good causal model. The number of possible BNs grows exponentially as the number of features increases. With, say, 20 variables in your data, there’s effectively zero chance of randomly stumbling across the true causal model. 
This means you’re using a BN that’s making bad modeling decisions.\n\nHope this clarifies the differences and ambiguities between Bayesian and Causal Networks.\n\n\nCausal Inferences\n\n\nCausal inference refers to the process of drawing conclusions about the causal relationships between variables. In other words, it involves making judgments about whether changes in one variable are responsible for changes in another variable.\n\nHere are some examples of causal inference questions:\n\n 1. Does taking medication X reduce blood pressure in people with hypertension?\n 2. Does increasing the price of cigarettes lead to a decrease in smoking rates?\n 3. Does participating in an exercise program improve physical fitness?\n 4. Does attending preschool lead to better academic outcomes in school?\n 5. Does exposure to air pollution increase the risk of respiratory problems?\n 6. Did the treatment directly help those who took it?\n 7. Was it the marketing campaign that lead to increased sales this month or the holiday?\n 8. How big of an effect would increased wages have on productivity?\n\n\nThese are just a few examples, but causal inference questions can be asked in many different fields and contexts. The key is that they are trying to understand whether a particular intervention or exposure causes a change in some outcome or dependent variable.\n\n\nDo Calculus & Do Operator\n\n\nCausality - Formal Definition\n\n\nBefore going ahead , lets again define Causality in terms of interventions.\n\nIn the context of interventions, causality refers to the relationship between an intervention (also known as a treatment or exposure) and the resulting effect on an outcome or dependent variable. A causal relationship between an intervention and an outcome means that the intervention is responsible for the observed change in the outcome. 
In other words, if we change the intervention, we expect to see a corresponding change in the outcome.\n\nFor example, if a medication is found to reduce blood pressure in people with hypertension, we can say that there is a causal relationship between taking the medication and lowering blood pressure. This is because we expect that if we give the medication to a group of people with hypertension, their blood pressure will decrease as a result of the intervention. However, it is important to note that there may be other factors that could have influenced the relationship between the intervention and the outcome.\n\nWhat is Do-Calculus ?\n\n\nIn the context of causal AI, do-calculus can be used to reason about the effects of interventions on a causal model. In causal AI, a causal model represents the relationships between different variables in a system, and the do-calculus can be used to reason about how changing the value of one variable (the \"intervention\") will affect the values of other variables in the system. This can be useful for understanding the potential consequences of interventions in a real-world system, or for identifying the most effective intervention to achieve a particular outcome.\n\nHere is Judea Pearl’s canonical primer on do-calculus—a short PDF with lots of math and proofs (Pearl 2012).\n\nWhat do-operator does ?\n\n\nHowever, how does that fit into causality’s mathematical representation?\n\nThe do-operator is a mathematical representation of a physical intervention.\n\nIf we start with the model Z → X → Y, we can simulate an intervention in X by deleting all the incoming arrows to X, and manually setting X to some value x0\n\n\nBelow is the illustration how do-operator works.\n\nRules Of Do Calculus\n\n\n\n\nBeneath this scary math, each rule has specific intuition and purpose behind it! 
Here’s what each rule actually does:\n\nRule 1: Decide if we can ignore an observation\nRule 2: Decide if we can treat an intervention as an observation\nRule 3: Decide if we can ignore an intervention\n\n\n\nWhoa! That’s exceptionally logical. Each rule is designed to help simplify and reduce nodes in a DAG by either ignoring them (Rules 1 and 3) or making it so interventions like do⁡(⋅)do(⋅) can be treated like observations instead (Rule 2).\n\n\nReferences\n\n\n 1. Judea Pearl’s canonical primer on do-calculus\n","html":"<p>This post is the fourth post of the series on <a href=\"http://localhost:2368/tag/causal-machine-learning\">Causal Machine Learning</a>. This blog post is based on the work of Judea Pearl. We have so far discussed about Causal AI , How to represent causality using SCMs , How naive statistics can fail (Spurious Correlations , Simpson Paradox &amp; Asymmetry In Causal Inference) , Pearl's Causal Hierarchy and gave flavors of Causal Inferences &amp; Causal Discovery etc. As always i will try to keep the things as simple as possible. So stay with me , Enjoy reading !</p><!--kg-card-begin: markdown--><h3 id=\"how-are-causal-ai-models-different-from-bayesian-networks\">How are Causal AI models different from Bayesian networks?</h3>\n<!--kg-card-end: markdown--><p>At first glance there, there could be some ambiguities you feel while trying to explore Bayesian Networks and Causal Networks. This is completely normal. Lets try to uncover the differences before going ahead!. </p><blockquote>Bayesian networks and Causal AI models appear similar. But Causal AI models capture underlying causal relationships; Bayesian networks just describe patterns of correlations. </blockquote><!--kg-card-begin: markdown--><h4 id=\"what-are-bayesian-networks\">What Are Bayesian Networks ?</h4>\n<!--kg-card-end: markdown--><p>You remember the example we discussed earlier about the data describing people’s sleeping habits. 
We found that there’s a strong correlation between falling asleep with shoes on and waking up with a headache. </p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/211162715-e934cca9-50aa-4d08-a259-c04c4112fa11.png\" align=\"center\" alt=\"Headache and Sleeping Shoes On\" width=\"665\" height=\"239\"/>\n<!--kg-card-end: markdown--><p>A Bayesian network representing this is given below.</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/211162842-a197dc73-3cd8-4597-a12b-d93ad49fad82.png\" align=\"center\" alt=\"Bayesian Network Representation\" width=\"527\" height=\"453\"/>\n<!--kg-card-end: markdown--><p>The BN tells us that both sleeping with shoes on and waking up with a migraine are correlated with drinking the night before, since there is a path between the variables. It also says that conditional on us knowing that someone was drinking the night before, knowing that they slept with their shoes on tells us absolutely nothing extra about whether they have a headache the next morning (this is called “conditional independence”). We can read off the conditional independence relationship by noticing that drinking alcohol “blocks” the pathway from shoe- sleeping to headache. BNs help to draw conclusions when more data becomes available (via “<a href=\"https://www.youtube.com/watch?v=HZGCoVF3YvM\" rel=\"noreferrer noopener\">Bayes’ theorem</a>”)</p><!--kg-card-begin: markdown--><h4 id=\"what-are-causal-networks\">What Are Causal Networks ?</h4>\n<!--kg-card-end: markdown--><p>BNs sound useful! What’s the catch? The core problem is that <strong>Bayesian networks are blind to causality</strong>. This key, missing ingredient makes BNs very limited when it comes to more sophisticated reasoning and decision-making challenges.</p><p>Causal AI is a new category of machine intelligence. 
<strong>Causal AI builds models that are able to capture cause- effect relationships </strong>while also retaining the benefits of BNs. </p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/211164237-265f8d20-5b8c-4a31-95cc-897989888817.png\" align=\"center\" alt=\"Bayesian Network Vs Causal AI Model\" width=\"359\" height=\"219\"/>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/211164446-257a87fc-975c-4cef-accc-1fac10ba489f.png\" align=\"center\" alt=\"Bayesian Vs Causal\" width=\"720\" height=\"409\"/>\n<!--kg-card-end: markdown--><p>Many BNs are all statistically compatible with the data, but only one BN corresponds to the genuine causal relationships in the system. As a result, it’s always left ambiguous whether your BN is a good causal model or not. And the overwhelming chances are that your BN is not a good causal model. The number of possible BNs grows exponentially as the number of features increases. With, say, 20 variables in your data, there’s effectively zero chance of randomly stumbling across the true causal model. This means you’re using a BN that’s making bad modeling decisions.</p><p>Hope this clarifies the differences and ambiguities between Bayesian and Causal Networks.</p><!--kg-card-begin: markdown--><h3 id=\"causal-inferences\">Causal Inferences</h3>\n<!--kg-card-end: markdown--><p>Causal inference refers to the process of drawing conclusions about the causal relationships between variables. 
In other words, it involves making judgments about whether changes in one variable are responsible for changes in another variable.</p><p>Here are some examples of causal inference questions:</p><!--kg-card-begin: markdown--><ol>\n<li>Does taking medication X reduce blood pressure in people with hypertension?</li>\n<li>Does increasing the price of cigarettes lead to a decrease in smoking rates?</li>\n<li>Does participating in an exercise program improve physical fitness?</li>\n<li>Does attending preschool lead to better academic outcomes in school?</li>\n<li>Does exposure to air pollution increase the risk of respiratory problems?</li>\n<li>Did the treatment directly help those who took it?</li>\n<li>Was it the marketing campaign that lead to increased sales this month or the holiday?</li>\n<li>How big of an effect would increased wages have on productivity?</li>\n</ol>\n<!--kg-card-end: markdown--><p>These are just a few examples, but causal inference questions can be asked in many different fields and contexts. The key is that they are trying to understand whether a particular intervention or exposure causes a change in some outcome or dependent variable.</p><!--kg-card-begin: markdown--><h3 id=\"do-calculus-do-operator\">Do Calculus &amp; Do Operator</h3>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><h4 id=\"causalityformal-definition\">Causality - Formal Definition</h4>\n<!--kg-card-end: markdown--><p>Before going ahead , lets again define Causality in terms of interventions.</p><blockquote>In the context of interventions, causality refers to the relationship between an intervention (also known as a treatment or exposure) and the resulting effect on an outcome or dependent variable. A causal relationship between an intervention and an outcome means that the intervention is responsible for the observed change in the outcome. 
In other words, if we change the intervention, we expect to see a corresponding change in the outcome.</blockquote><p>For example, if a medication is found to reduce blood pressure in people with hypertension, we can say that there is a causal relationship between taking the medication and lowering blood pressure. This is because we expect that if we give the medication to a group of people with hypertension, their blood pressure will decrease as a result of the intervention. However, it is important to note that there may be other factors that could have influenced the relationship between the intervention and the outcome.</p><!--kg-card-begin: markdown--><h4 id=\"what-is-do-calculus\">What is Do-Calculus ?</h4>\n<!--kg-card-end: markdown--><p>In the context of causal AI, do-calculus can be used to reason about the effects of interventions on a causal model. In causal AI, a causal model represents the relationships between different variables in a system, and the do-calculus can be used to reason about how changing the value of one variable (the \"intervention\") will affect the values of other variables in the system. 
This can be useful for understanding the potential consequences of interventions in a real-world system, or for identifying the most effective intervention to achieve a particular outcome.</p><p>H<a href=\"https://ftp.cs.ucla.edu/pub/stat_ser/r402.pdf\">ere is Judea Pearl’s canonical primer on <em>do</em>-calculus</a>—a short PDF with lots of math and proofs (<a href=\"https://www.andrewheiss.com/blog/2021/09/07/do-calculus-backdoors/#ref-Pearl:2012\">Pearl 2012</a>).</p><!--kg-card-begin: markdown--><h4 id=\"what-do-operator-does\">What do-operator does ?</h4>\n<!--kg-card-end: markdown--><blockquote><em>However, how does that fit into causality’s mathematical representation?</em></blockquote><p>The <em>do-operator</em> is a mathematical representation of a physical intervention.</p><!--kg-card-begin: markdown--><p>If we start with the model Z → X → Y, we can simulate an intervention in X by deleting all the incoming arrows to X, and manually setting X to some value x<sub>0</sub></p>\n<!--kg-card-end: markdown--><p>Below is the illustration how do-operator works. </p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/211196318-0f07a704-ff54-4eff-82a7-d2b31913406c.png\" align=\"center\" alt=\"Illustration Of Working Of Do-Operator\" width=\"649\" height=\"593\"/><!--kg-card-end: markdown--><!--kg-card-begin: markdown--><h4 id=\"rules-of-do-calculus\">Rules Of Do Calculus</h4>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/211195804-e99bff5b-b622-4184-9b21-99e96e9c9495.png\" align=\"center\" alt=\"Do Calculus Rules\" width=\"720\" height=\"405\"/><!--kg-card-end: markdown--><p></p><p>Beneath this scary math, each rule has specific intuition and purpose behind it! 
Here’s what each rule actually does:</p><!--kg-card-begin: markdown--><pre><code>Rule 1: Decide if we can ignore an observation\nRule 2: Decide if we can treat an intervention as an observation\nRule 3: Decide if we can ignore an intervention\n</code></pre>\n<!--kg-card-end: markdown--><p>Whoa! That’s exceptionally logical. Each rule is designed to help simplify and reduce nodes in a DAG by either ignoring them (Rules 1 and 3) or making it so interventions like do⁡(⋅)do(⋅) can be treated like observations instead (Rule 2).</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/213162724-cbc40a1b-71eb-42c0-bb7e-c6f0f4f5dc27.jpg\" align=\"center\" alt=\"Do Calculus Rules\" width=\"640\" height=\"673\"/><!--kg-card-end: markdown--><!--kg-card-begin: markdown--><h3 id=\"references\">References</h3>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><ol>\n<li><a href=\"https://ftp.cs.ucla.edu/pub/stat_ser/r402.pdf\">Judea Pearl’s canonical primer on do-calculus</a></li>\n</ol>\n<!--kg-card-end: markdown-->","url":"http://localhost:2368/causal-machine-learning-part-4/","canonical_url":null,"uuid":"183df7a0-e091-4af9-94bf-e542c0bed9a2","codeinjection_foot":null,"codeinjection_head":null,"codeinjection_styles":null,"comment_id":"63b57b53bdc6867fe35526d6","reading_time":5,"send_email_when_published":null,"email_subject":null,"childHtmlRehype":{"html":"<p>This post is the fourth post of the series on <a href=\"/tag/causal-machine-learning\">Causal Machine Learning</a>. This blog post is based on the work of Judea Pearl. We have so far discussed about Causal AI , How to represent causality using SCMs , How naive statistics can fail (Spurious Correlations , Simpson Paradox &#x26; Asymmetry In Causal Inference) , Pearl's Causal Hierarchy and gave flavors of Causal Inferences &#x26; Causal Discovery etc. As always i will try to keep the things as simple as possible. 
So stay with me , Enjoy reading !</p><!--kg-card-begin: markdown--><h3 id=\"how-are-causal-ai-models-different-from-bayesian-networks\">How are Causal AI models different from Bayesian networks?</h3>\n<!--kg-card-end: markdown--><p>At first glance there, there could be some ambiguities you feel while trying to explore Bayesian Networks and Causal Networks. This is completely normal. Lets try to uncover the differences before going ahead!. </p><blockquote>Bayesian networks and Causal AI models appear similar. But Causal AI models capture underlying causal relationships; Bayesian networks just describe patterns of correlations. </blockquote><!--kg-card-begin: markdown--><h4 id=\"what-are-bayesian-networks\">What Are Bayesian Networks ?</h4>\n<!--kg-card-end: markdown--><p>You remember the example we discussed earlier about the data describing people’s sleeping habits. We found that there’s a strong correlation between falling asleep with shoes on and waking up with a headache. </p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/211162715-e934cca9-50aa-4d08-a259-c04c4112fa11.png\" align=\"center\" alt=\"Headache and Sleeping Shoes On\" width=\"665\" height=\"239\">\n<!--kg-card-end: markdown--><p>A Bayesian network representing this is given below.</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/211162842-a197dc73-3cd8-4597-a12b-d93ad49fad82.png\" align=\"center\" alt=\"Bayesian Network Representation\" width=\"527\" height=\"453\">\n<!--kg-card-end: markdown--><p>The BN tells us that both sleeping with shoes on and waking up with a migraine are correlated with drinking the night before, since there is a path between the variables. 
It also says that conditional on us knowing that someone was drinking the night before, knowing that they slept with their shoes on tells us absolutely nothing extra about whether they have a headache the next morning (this is called “conditional independence”). We can read off the conditional independence relationship by noticing that drinking alcohol “blocks” the pathway from shoe- sleeping to headache. BNs help to draw conclusions when more data becomes available (via “<a href=\"https://www.youtube.com/watch?v=HZGCoVF3YvM\" rel=\"noreferrer noopener\">Bayes’ theorem</a>”)</p><!--kg-card-begin: markdown--><h4 id=\"what-are-causal-networks\">What Are Causal Networks ?</h4>\n<!--kg-card-end: markdown--><p>BNs sound useful! What’s the catch? The core problem is that <strong>Bayesian networks are blind to causality</strong>. This key, missing ingredient makes BNs very limited when it comes to more sophisticated reasoning and decision-making challenges.</p><p>Causal AI is a new category of machine intelligence. <strong>Causal AI builds models that are able to capture cause- effect relationships </strong>while also retaining the benefits of BNs. </p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/211164237-265f8d20-5b8c-4a31-95cc-897989888817.png\" align=\"center\" alt=\"Bayesian Network Vs Causal AI Model\" width=\"359\" height=\"219\">\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/211164446-257a87fc-975c-4cef-accc-1fac10ba489f.png\" align=\"center\" alt=\"Bayesian Vs Causal\" width=\"720\" height=\"409\">\n<!--kg-card-end: markdown--><p>Many BNs are all statistically compatible with the data, but only one BN corresponds to the genuine causal relationships in the system. As a result, it’s always left ambiguous whether your BN is a good causal model or not. And the overwhelming chances are that your BN is not a good causal model. 
The number of possible BNs grows exponentially as the number of features increases. With, say, 20 variables in your data, there’s effectively zero chance of randomly stumbling across the true causal model. This means you’re using a BN that’s making bad modeling decisions.</p><p>Hope this clarifies the differences and ambiguities between Bayesian and Causal Networks.</p><!--kg-card-begin: markdown--><h3 id=\"causal-inferences\">Causal Inferences</h3>\n<!--kg-card-end: markdown--><p>Causal inference refers to the process of drawing conclusions about the causal relationships between variables. In other words, it involves making judgments about whether changes in one variable are responsible for changes in another variable.</p><p>Here are some examples of causal inference questions:</p><!--kg-card-begin: markdown--><ol>\n<li>Does taking medication X reduce blood pressure in people with hypertension?</li>\n<li>Does increasing the price of cigarettes lead to a decrease in smoking rates?</li>\n<li>Does participating in an exercise program improve physical fitness?</li>\n<li>Does attending preschool lead to better academic outcomes in school?</li>\n<li>Does exposure to air pollution increase the risk of respiratory problems?</li>\n<li>Did the treatment directly help those who took it?</li>\n<li>Was it the marketing campaign that lead to increased sales this month or the holiday?</li>\n<li>How big of an effect would increased wages have on productivity?</li>\n</ol>\n<!--kg-card-end: markdown--><p>These are just a few examples, but causal inference questions can be asked in many different fields and contexts. 
The key is that they are trying to understand whether a particular intervention or exposure causes a change in some outcome or dependent variable.</p><!--kg-card-begin: markdown--><h3 id=\"do-calculus-do-operator\">Do Calculus &#x26; Do Operator</h3>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><h4 id=\"causalityformal-definition\">Causality - Formal Definition</h4>\n<!--kg-card-end: markdown--><p>Before going ahead , lets again define Causality in terms of interventions.</p><blockquote>In the context of interventions, causality refers to the relationship between an intervention (also known as a treatment or exposure) and the resulting effect on an outcome or dependent variable. A causal relationship between an intervention and an outcome means that the intervention is responsible for the observed change in the outcome. In other words, if we change the intervention, we expect to see a corresponding change in the outcome.</blockquote><p>For example, if a medication is found to reduce blood pressure in people with hypertension, we can say that there is a causal relationship between taking the medication and lowering blood pressure. This is because we expect that if we give the medication to a group of people with hypertension, their blood pressure will decrease as a result of the intervention. However, it is important to note that there may be other factors that could have influenced the relationship between the intervention and the outcome.</p><!--kg-card-begin: markdown--><h4 id=\"what-is-do-calculus\">What is Do-Calculus ?</h4>\n<!--kg-card-end: markdown--><p>In the context of causal AI, do-calculus can be used to reason about the effects of interventions on a causal model. In causal AI, a causal model represents the relationships between different variables in a system, and the do-calculus can be used to reason about how changing the value of one variable (the \"intervention\") will affect the values of other variables in the system. 
This can be useful for understanding the potential consequences of interventions in a real-world system, or for identifying the most effective intervention to achieve a particular outcome.</p><p>H<a href=\"https://ftp.cs.ucla.edu/pub/stat_ser/r402.pdf\">ere is Judea Pearl’s canonical primer on <em>do</em>-calculus</a>—a short PDF with lots of math and proofs (<a href=\"https://www.andrewheiss.com/blog/2021/09/07/do-calculus-backdoors/#ref-Pearl:2012\">Pearl 2012</a>).</p><!--kg-card-begin: markdown--><h4 id=\"what-do-operator-does\">What do-operator does ?</h4>\n<!--kg-card-end: markdown--><blockquote><em>However, how does that fit into causality’s mathematical representation?</em></blockquote><p>The <em>do-operator</em> is a mathematical representation of a physical intervention.</p><!--kg-card-begin: markdown--><p>If we start with the model Z → X → Y, we can simulate an intervention in X by deleting all the incoming arrows to X, and manually setting X to some value x<sub>0</sub></p>\n<!--kg-card-end: markdown--><p>Below is the illustration how do-operator works. </p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/211196318-0f07a704-ff54-4eff-82a7-d2b31913406c.png\" align=\"center\" alt=\"Illustration Of Working Of Do-Operator\" width=\"649\" height=\"593\"><!--kg-card-end: markdown--><!--kg-card-begin: markdown--><h4 id=\"rules-of-do-calculus\">Rules Of Do Calculus</h4>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/211195804-e99bff5b-b622-4184-9b21-99e96e9c9495.png\" align=\"center\" alt=\"Do Calculus Rules\" width=\"720\" height=\"405\"><!--kg-card-end: markdown--><p></p><p>Beneath this scary math, each rule has specific intuition and purpose behind it! 
Here’s what each rule actually does:</p><!--kg-card-begin: markdown--><div class=\"kg-card kg-code-card gatsby-highlight\" data-language=\"text\"><pre class=\"language-text\"><code class=\"language-text\">Rule 1: Decide if we can ignore an observation\nRule 2: Decide if we can treat an intervention as an observation\nRule 3: Decide if we can ignore an intervention\n</code></pre></div>\n<!--kg-card-end: markdown--><p>Whoa! That’s exceptionally logical. Each rule is designed to help simplify and reduce nodes in a DAG by either ignoring them (Rules 1 and 3) or making it so interventions like do(⋅) can be treated like observations instead (Rule 2).</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/213162724-cbc40a1b-71eb-42c0-bb7e-c6f0f4f5dc27.jpg\" align=\"center\" alt=\"Do Calculus Rules\" width=\"640\" height=\"673\"><!--kg-card-end: markdown--><!--kg-card-begin: markdown--><h3 id=\"references\">References</h3>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><ol>\n<li><a href=\"https://ftp.cs.ucla.edu/pub/stat_ser/r402.pdf\">Judea Pearl’s canonical primer on do-calculus</a></li>\n</ol>\n<!--kg-card-end: markdown-->","htmlAst":{"type":"root","children":[{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"This post is the fifth post of the series on "},{"type":"element","tagName":"a","properties":{"href":"/tag/causal-machine-learning"},"children":[{"type":"text","value":"Causal Machine Learning"}]},{"type":"text","value":". This blog post is based on the work of Judea Pearl. We have so far discussed about Causal AI , How to represent causality using SCMs , How naive statistics can fail (Spurious Correlations , Simpson Paradox & Asymmetry In Causal Inference) , Pearl's Causal Hierarchy and gave flavors of Causal Inferences & Causal Discovery etc. As always i will try to keep the things as simple as possible. 
So stay with me , Enjoy reading !"}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h3","properties":{"id":"how-are-causal-ai-models-different-from-bayesian-networks"},"children":[{"type":"text","value":"How are Causal AI models different from Bayesian networks?"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"At first glance, there could be some ambiguities you feel while trying to explore Bayesian Networks and Causal Networks. This is completely normal. Lets try to uncover the differences before going ahead! "}]},{"type":"element","tagName":"blockquote","properties":{},"children":[{"type":"text","value":"Bayesian networks and Causal AI models appear similar. But Causal AI models capture underlying causal relationships; Bayesian networks just describe patterns of correlations. "}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h4","properties":{"id":"what-are-bayesian-networks"},"children":[{"type":"text","value":"What Are Bayesian Networks ?"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"You remember the example we discussed earlier about the data describing people’s sleeping habits. We found that there’s a strong correlation between falling asleep with shoes on and waking up with a headache. 
"}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/211162715-e934cca9-50aa-4d08-a259-c04c4112fa11.png","align":"center","alt":"Headache and Sleeping Shoes On","width":665,"height":239},"children":[]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"A Bayesian network representing this is given below."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/211162842-a197dc73-3cd8-4597-a12b-d93ad49fad82.png","align":"center","alt":"Bayesian Network Representation","width":527,"height":453},"children":[]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"The BN tells us that both sleeping with shoes on and waking up with a migraine are correlated with drinking the night before, since there is a path between the variables. It also says that conditional on us knowing that someone was drinking the night before, knowing that they slept with their shoes on tells us absolutely nothing extra about whether they have a headache the next morning (this is called “conditional independence”). We can read off the conditional independence relationship by noticing that drinking alcohol “blocks” the pathway from shoe- sleeping to headache. 
BNs help to draw conclusions when more data becomes available (via “"},{"type":"element","tagName":"a","properties":{"href":"https://www.youtube.com/watch?v=HZGCoVF3YvM","rel":["noreferrer","noopener"]},"children":[{"type":"text","value":"Bayes’ theorem"}]},{"type":"text","value":"”)"}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h4","properties":{"id":"what-are-causal-networks"},"children":[{"type":"text","value":"What Are Causal Networks ?"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"BNs sound useful! What’s the catch? The core problem is that "},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"Bayesian networks are blind to causality"}]},{"type":"text","value":". This key, missing ingredient makes BNs very limited when it comes to more sophisticated reasoning and decision-making challenges."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Causal AI is a new category of machine intelligence. "},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"Causal AI builds models that are able to capture cause- effect relationships "}]},{"type":"text","value":"while also retaining the benefits of BNs. 
"}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/211164237-265f8d20-5b8c-4a31-95cc-897989888817.png","align":"center","alt":"Bayesian Network Vs Causal AI Model","width":359,"height":219},"children":[]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/211164446-257a87fc-975c-4cef-accc-1fac10ba489f.png","align":"center","alt":"Bayesian Vs Causal","width":720,"height":409},"children":[]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Many BNs are all statistically compatible with the data, but only one BN corresponds to the genuine causal relationships in the system. As a result, it’s always left ambiguous whether your BN is a good causal model or not. And the overwhelming chances are that your BN is not a good causal model. The number of possible BNs grows exponentially as the number of features increases. With, say, 20 variables in your data, there’s effectively zero chance of randomly stumbling across the true causal model. 
This means you’re using a BN that’s making bad modeling decisions."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Hope this clarifies the differences and ambiguities between Bayesian and Causal Networks."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h3","properties":{"id":"causal-inferences"},"children":[{"type":"text","value":"Causal Inferences"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Causal inference refers to the process of drawing conclusions about the causal relationships between variables. In other words, it involves making judgments about whether changes in one variable are responsible for changes in another variable."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Here are some examples of causal inference questions:"}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"ol","properties":{},"children":[{"type":"text","value":"\n"},{"type":"element","tagName":"li","properties":{},"children":[{"type":"text","value":"Does taking medication X reduce blood pressure in people with hypertension?"}]},{"type":"text","value":"\n"},{"type":"element","tagName":"li","properties":{},"children":[{"type":"text","value":"Does increasing the price of cigarettes lead to a decrease in smoking rates?"}]},{"type":"text","value":"\n"},{"type":"element","tagName":"li","properties":{},"children":[{"type":"text","value":"Does participating in an exercise program improve physical fitness?"}]},{"type":"text","value":"\n"},{"type":"element","tagName":"li","properties":{},"children":[{"type":"text","value":"Does attending preschool lead to better academic outcomes in school?"}]},{"type":"text","value":"\n"},{"type":"element","tagName":"li","properties":{},"children":[{"type":"text","value":"Does exposure to air 
pollution increase the risk of respiratory problems?"}]},{"type":"text","value":"\n"},{"type":"element","tagName":"li","properties":{},"children":[{"type":"text","value":"Did the treatment directly help those who took it?"}]},{"type":"text","value":"\n"},{"type":"element","tagName":"li","properties":{},"children":[{"type":"text","value":"Was it the marketing campaign that led to increased sales this month or the holiday?"}]},{"type":"text","value":"\n"},{"type":"element","tagName":"li","properties":{},"children":[{"type":"text","value":"How big of an effect would increased wages have on productivity?"}]},{"type":"text","value":"\n"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"These are just a few examples, but causal inference questions can be asked in many different fields and contexts. The key is that they are trying to understand whether a particular intervention or exposure causes a change in some outcome or dependent variable."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h3","properties":{"id":"do-calculus-do-operator"},"children":[{"type":"text","value":"Do Calculus & Do Operator"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h4","properties":{"id":"causalityformal-definition"},"children":[{"type":"text","value":"Causality - Formal Definition"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Before going ahead , lets again define Causality in terms of interventions."}]},{"type":"element","tagName":"blockquote","properties":{},"children":[{"type":"text","value":"In the context of interventions, causality refers to the relationship between an intervention (also known 
as a treatment or exposure) and the resulting effect on an outcome or dependent variable. A causal relationship between an intervention and an outcome means that the intervention is responsible for the observed change in the outcome. In other words, if we change the intervention, we expect to see a corresponding change in the outcome."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"For example, if a medication is found to reduce blood pressure in people with hypertension, we can say that there is a causal relationship between taking the medication and lowering blood pressure. This is because we expect that if we give the medication to a group of people with hypertension, their blood pressure will decrease as a result of the intervention. However, it is important to note that there may be other factors that could have influenced the relationship between the intervention and the outcome."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h4","properties":{"id":"what-is-do-calculus"},"children":[{"type":"text","value":"What is Do-Calculus ?"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"In the context of causal AI, do-calculus can be used to reason about the effects of interventions on a causal model. In causal AI, a causal model represents the relationships between different variables in a system, and the do-calculus can be used to reason about how changing the value of one variable (the \"intervention\") will affect the values of other variables in the system. 
This can be useful for understanding the potential consequences of interventions in a real-world system, or for identifying the most effective intervention to achieve a particular outcome."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"H"},{"type":"element","tagName":"a","properties":{"href":"https://ftp.cs.ucla.edu/pub/stat_ser/r402.pdf"},"children":[{"type":"text","value":"ere is Judea Pearl’s canonical primer on "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"do"}]},{"type":"text","value":"-calculus"}]},{"type":"text","value":"—a short PDF with lots of math and proofs ("},{"type":"element","tagName":"a","properties":{"href":"https://www.andrewheiss.com/blog/2021/09/07/do-calculus-backdoors/#ref-Pearl:2012"},"children":[{"type":"text","value":"Pearl 2012"}]},{"type":"text","value":")."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h4","properties":{"id":"what-do-operator-does"},"children":[{"type":"text","value":"What do-operator does ?"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"blockquote","properties":{},"children":[{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"However, how does that fit into causality’s mathematical representation?"}]}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"The "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"do-operator"}]},{"type":"text","value":" is a mathematical representation of a physical intervention."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"If we start with the model Z → X → Y, we can simulate an intervention in X by deleting all the incoming arrows to X, and manually setting X to some value 
x"},{"type":"element","tagName":"sub","properties":{},"children":[{"type":"text","value":"0"}]}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Below is the illustration how do-operator works. "}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/211196318-0f07a704-ff54-4eff-82a7-d2b31913406c.png","align":"center","alt":"Illustration Of Working Of Do-Operator","width":649,"height":593},"children":[]},{"type":"comment","value":"kg-card-end: markdown"},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h4","properties":{"id":"rules-of-do-calculus"},"children":[{"type":"text","value":"Rules Of Do Calculus"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/211195804-e99bff5b-b622-4184-9b21-99e96e9c9495.png","align":"center","alt":"Do Calculus Rules","width":720,"height":405},"children":[]},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Beneath this scary math, each rule has specific intuition and purpose behind it! 
Here’s what each rule actually does:"}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"div","properties":{"className":["kg-card","kg-code-card","gatsby-highlight"],"dataLanguage":"text"},"children":[{"type":"element","tagName":"pre","properties":{"className":["language-text"]},"children":[{"type":"element","tagName":"code","properties":{"className":["language-text"]},"children":[{"type":"text","value":"Rule 1: Decide if we can ignore an observation\nRule 2: Decide if we can treat an intervention as an observation\nRule 3: Decide if we can ignore an intervention\n"}]}]}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Whoa! That’s exceptionally logical. Each rule is designed to help simplify and reduce nodes in a DAG by either ignoring them (Rules 1 and 3) or making it so interventions like do(⋅) can be treated like observations instead (Rule 2)."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/213162724-cbc40a1b-71eb-42c0-bb7e-c6f0f4f5dc27.jpg","align":"center","alt":"Do Calculus Rules","width":640,"height":673},"children":[]},{"type":"comment","value":"kg-card-end: markdown"},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h3","properties":{"id":"references"},"children":[{"type":"text","value":"References"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"ol","properties":{},"children":[{"type":"text","value":"\n"},{"type":"element","tagName":"li","properties":{},"children":[{"type":"element","tagName":"a","properties":{"href":"https://ftp.cs.ucla.edu/pub/stat_ser/r402.pdf"},"children":[{"type":"text","value":"Judea Pearl’s canonical primer on 
do-calculus"}]}]},{"type":"text","value":"\n"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"}],"data":{"quirksMode":false}},"tableOfContents":[{"id":"how-are-causal-ai-models-different-from-bayesian-networks","heading":"How are Causal AI models different from Bayesian networks?","items":[{"id":"what-are-bayesian-networks","heading":"What Are Bayesian Networks ?"},{"id":"what-are-causal-networks","heading":"What Are Causal Networks ?"}]},{"id":"causal-inferences","heading":"Causal Inferences"},{"id":"do-calculus-do-operator","heading":"Do Calculus & Do Operator","items":[{"id":"causalityformal-definition","heading":"Causality - Formal Definition"},{"id":"what-is-do-calculus","heading":"What is Do-Calculus ?"},{"id":"what-do-operator-does","heading":"What do-operator does ?"},{"id":"rules-of-do-calculus","heading":"Rules Of Do Calculus"}]},{"id":"references","heading":"References"}]},"featureImageSharp":{"base":"photo-1535378620166-273708d44e4c.jpg","publicURL":"/static/15ece0236a87dee6e33bbaabd76260fb/photo-1535378620166-273708d44e4c.jpg","imageMeta":{"width":2000,"height":1768},"childImageSharp":{"fluid":{"base64":"data:image/jpeg;base64,/9j/2wBDABALDA4MChAODQ4SERATGCgaGBYWGDEjJR0oOjM9PDkzODdASFxOQERXRTc4UG1RV19iZ2hnPk1xeXBkeFxlZ2P/2wBDARESEhgVGC8aGi9jQjhCY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2P/wgARCAASABQDASIAAhEBAxEB/8QAGAABAQEBAQAAAAAAAAAAAAAAAAMFAgT/xAAVAQEBAAAAAAAAAAAAAAAAAAAAAf/aAAwDAQACEAMQAAAByK29cYzpVqkRFf/EABwQAQABBAMAAAAAAAAAAAAAAAIBABIhMgMRI//aAAgBAQABBQIxmbZNcVvXkSsKN3o6/8QAFBEBAAAAAAAAAAAAAAAAAAAAIP/aAAgBAwEBPwEf/8QAFBEBAAAAAAAAAAAAAAAAAAAAIP/aAAgBAgEBPwEf/8QAHBAAAQMFAAAAAAAAAAAAAAAAEQABIQIQIDJx/9oACAEBAAY/AlDAWFTStT3H/8QAHRAAAgIBBQAAAAAAAAAAAAAAAAERMSFBUXGRsf/aAAgBAQABPyFTyFlT7ZmRojyLa7DTtDsLIujzLiy4P//aAAwDAQACAAMAAAAQ28fA/8QAFBEBAAAAAAAAAAAAAAAAAAAAIP/aAAgBAwEBPxAf/8QAFBEBAAAAAAAAAAAAAAAAAAAAIP/aAAgBAgEBPxAf/8QAHRABAAICAwEBAAAAAAAAAAAAAQARITFBUWGBsf/aAAgBAQABPxAUYFK/C5ZQI4VmsXMkuwZYtjyBF
qlNh8qAXq7t1xE3ywnf1+wC6W13P//Z","aspectRatio":1.1290322580645162,"src":"/static/15ece0236a87dee6e33bbaabd76260fb/ea4ab/photo-1535378620166-273708d44e4c.jpg","srcSet":"/static/15ece0236a87dee6e33bbaabd76260fb/477ba/photo-1535378620166-273708d44e4c.jpg 175w,\n/static/15ece0236a87dee6e33bbaabd76260fb/06776/photo-1535378620166-273708d44e4c.jpg 350w,\n/static/15ece0236a87dee6e33bbaabd76260fb/ea4ab/photo-1535378620166-273708d44e4c.jpg 700w,\n/static/15ece0236a87dee6e33bbaabd76260fb/3055e/photo-1535378620166-273708d44e4c.jpg 1050w,\n/static/15ece0236a87dee6e33bbaabd76260fb/eff08/photo-1535378620166-273708d44e4c.jpg 1400w,\n/static/15ece0236a87dee6e33bbaabd76260fb/4e5f3/photo-1535378620166-273708d44e4c.jpg 2000w","srcWebp":"/static/15ece0236a87dee6e33bbaabd76260fb/89afa/photo-1535378620166-273708d44e4c.webp","srcSetWebp":"/static/15ece0236a87dee6e33bbaabd76260fb/9fca7/photo-1535378620166-273708d44e4c.webp 175w,\n/static/15ece0236a87dee6e33bbaabd76260fb/37a4e/photo-1535378620166-273708d44e4c.webp 350w,\n/static/15ece0236a87dee6e33bbaabd76260fb/89afa/photo-1535378620166-273708d44e4c.webp 700w,\n/static/15ece0236a87dee6e33bbaabd76260fb/78e7a/photo-1535378620166-273708d44e4c.webp 1050w,\n/static/15ece0236a87dee6e33bbaabd76260fb/03d34/photo-1535378620166-273708d44e4c.webp 1400w,\n/static/15ece0236a87dee6e33bbaabd76260fb/49d6b/photo-1535378620166-273708d44e4c.webp 2000w","sizes":"(max-width: 700px) 100vw, 700px"}}}},"allGhostPost":{"edges":[{"node":{"id":"Ghost__Post__63aaf0f3bdc6867fe35525cb","title":"Causal Machine Learning - Part 2","slug":"causal-machine-learning-part-2","featured":true,"feature_image":"https://images.unsplash.com/photo-1642341438078-af255e0f08a3?crop=entropy&cs=tinysrgb&fit=max&fm=jpg&ixid=MnwxMTc3M3wwfDF8c2VhcmNofDc3fHxyb2JvdCUyMHRoaW5raW5ofGVufDB8fHx8MTY3NDA0MzA5MA&ixlib=rb-4.0.3&q=80&w=2000","excerpt":"This post is the second post of the series on Causal Machine Learning.  This blog post is based on the work of Judea Pearl. 
I will discuss how naive statistics can fail (Spurious Correlations , Simpson Paradox & Asymmetry In Causal Inference).","custom_excerpt":"This post is the second post of the series on Causal Machine Learning.  This blog post is based on the work of Judea Pearl. I will discuss how naive statistics can fail (Spurious Correlations , Simpson Paradox & Asymmetry In Causal Inference).","visibility":"public","created_at_pretty":"27 Dec 2022","published_at_pretty":"27 Dec 2022","updated_at_pretty":"18 Jan 2023","created_at":"2022-12-27T18:49:47.000+05:30","published_at":"2022-12-27T18:51:50.000+05:30","updated_at":"2023-01-18T17:41:55.000+05:30","meta_title":null,"meta_description":null,"og_description":null,"og_image":null,"og_title":null,"twitter_description":null,"twitter_image":null,"twitter_title":null,"authors":[{"slug":"amaljith","url":"http://localhost:2368/author/amaljith/","name":"Amaljith","bio":"Research Scholar @ IIT Kharagpur","cover_image":null,"profile_image":"http://localhost:2368/content/images/2022/09/Screenshot-from-2022-09-07-18-00-00.png","location":null,"website":null,"twitter":null,"facebook":null,"meta_title":null,"meta_description":null,"coverImageSharp":null,"profileImageSharp":null}],"primary_author":{"slug":"amaljith","url":"http://localhost:2368/author/amaljith/","name":"Amaljith","bio":"Research Scholar @ IIT 
Kharagpur","cover_image":null,"profile_image":"http://localhost:2368/content/images/2022/09/Screenshot-from-2022-09-07-18-00-00.png","location":null,"website":null,"twitter":null,"facebook":null,"meta_title":null,"meta_description":null,"coverImageSharp":null,"profileImageSharp":{"base":"Screenshot-from-2022-09-07-18-00-00.png","publicURL":"/static/28e31bfedd96b4afe90237d2c1f700c3/Screenshot-from-2022-09-07-18-00-00.png","imageMeta":{"width":316,"height":237},"childImageSharp":{"fluid":{"base64":"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAPCAYAAADkmO9VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAEJElEQVQ4yyXO21fTBQDA8d9fUh5LQAW1eEImbIPJbSB3kZtijjSBxNvAGSAqlJZ64GAHIQ0VjkKICCiCQ8cYsYFjYwhsYxeGjJsopmEnH76dYw+f94/w2jmKfaSX9wtTrC1a+bBk5d83dlbnzFhHehnq62TBZeLjGyfvFqf4a9nGstuAx/oM96Qai76NieFu5u0jLLrNCCsOI47nat56J1hbtLH60kTfg5tUV6hQFeSSFLGTgtwcBtUteGc0zLoGmXNpmXU8Yd6txTOlxvq8hzmr/hNhyTaC06hmya3nn6VxHKMPKS08RFF+HgWK/aiO5JP3TQ4xMgn3mqt599bC8pyBRY+OxRkdXkc/rrE+POZ+nMY+hFXHKPPWP5m3a/i4Yqax9kfCREEkyKPJSk3mQFYmOelpRISGEBMuwahv4/WC4dNuwa3l1Us906O9jGs7sWg7EHQdv6NurWVmvJv33iHKTuSyft06tm3xJygwEIlIxDb/zUhFIvw2+HDrxkX+XjUx5x7A6+pn3qXBZnyIoacFTftNhKarlTxprsMz3sPbWR2jmiYiJSICt27lqwB/tmza+L8Af5KkYsYNHXhcvbidahy2x0ya2jFqm7lxqYxrF04jvLQMYB16jKazHrOulRWHjiJFOus/+5yAjX5s8vNhs58vfj5foooO43LJMaovncI1/ZhpWxcTlk6mRjrpa7uOrqsJYcaiY25imKftv6F52IBx4D61Z46y6Yv1+PpsYLOvD36+G0gJDuRcrIS0nTvY8XUAl66osDq60A3e5sHtXzCoW3CY+hHeeKdZdL5gytDDgsOAe1zL9KiawowEsoK2URi+HWWMmKvZuzibKuNcqoTiWBEXShToLHfo7K6h+9413DYzS/MehJnJYZxjOowDHdjHnjE53M2kUU1/zVmu742lOlNOkyKRmtzd1KkUtObFcacgmUetVTT0/kTDvQpemPoxDWmoralG8NrNvJqZwmbS8Xp5FrfNhMtuxtNei75Kia3xAmtdVZgaf6axvprKvXH8kB5FXd1lFMcyuVp7Hnd9E0+bb5Cdvgfhw4oXPq5hGR5kzGzkUVcPVeWlPK8tx/BrCfcrjnI+bx+KPSmcPJxP2QklFUolRfmHyf/uEEXK49y9XMndaxdJjpQirCy4WLCPoe38A4Nez8XySjISU5CFhCIO3k5wUDASkZiUuHj2Z2ahyN5HvuIgRUeOcUZZTOlJJeUlKgbU7Txta0AYbLnFqewMrpSepvhIITm700nblYg8IpJIWRjyiAjiI6JJjJKTGBtHcnwCqYlJpCelkJOWwfe533Jw337Kik9g1N5HqC9RcfyAgvTUPcRHyYmWypCFiJGKQggNDkYsEiEJEhEmCiE8REyYWIJMKiVCEoY8fCcJUTHE
x8QgCw0l/8Be/gOTiTiUD46AHAAAAABJRU5ErkJggg==","aspectRatio":1.3333333333333333,"src":"/static/28e31bfedd96b4afe90237d2c1f700c3/6ccb0/Screenshot-from-2022-09-07-18-00-00.png","srcSet":"/static/28e31bfedd96b4afe90237d2c1f700c3/7d89d/Screenshot-from-2022-09-07-18-00-00.png 28w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/f4091/Screenshot-from-2022-09-07-18-00-00.png 55w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/6ccb0/Screenshot-from-2022-09-07-18-00-00.png 110w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/30481/Screenshot-from-2022-09-07-18-00-00.png 165w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/33bd6/Screenshot-from-2022-09-07-18-00-00.png 220w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/d398b/Screenshot-from-2022-09-07-18-00-00.png 316w","srcWebp":"/static/28e31bfedd96b4afe90237d2c1f700c3/8678c/Screenshot-from-2022-09-07-18-00-00.webp","srcSetWebp":"/static/28e31bfedd96b4afe90237d2c1f700c3/59cda/Screenshot-from-2022-09-07-18-00-00.webp 28w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/7da75/Screenshot-from-2022-09-07-18-00-00.webp 55w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/8678c/Screenshot-from-2022-09-07-18-00-00.webp 110w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/f282e/Screenshot-from-2022-09-07-18-00-00.webp 165w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/a7b21/Screenshot-from-2022-09-07-18-00-00.webp 220w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/fb2b8/Screenshot-from-2022-09-07-18-00-00.webp 316w","sizes":"(max-width: 110px) 100vw, 110px"}}}},"primary_tag":{"slug":"machine-learning","url":"http://localhost:2368/tag/machine-learning/","name":"Machine Learning","visibility":"public","feature_image":null,"description":null,"meta_title":null,"meta_description":null,"featureImageSharp":null},"tags":[{"slug":"machine-learning","url":"http://localhost:2368/tag/machine-learning/","name":"Machine 
Learning","visibility":"public","feature_image":null,"description":null,"meta_title":null,"meta_description":null,"featureImageSharp":null},{"slug":"artificial-intelligence","url":"http://localhost:2368/tag/artificial-intelligence/","name":"Artificial Intelligence","visibility":"public","feature_image":null,"description":null,"meta_title":null,"meta_description":null,"featureImageSharp":null},{"slug":"causal-machine-learning","url":"http://localhost:2368/tag/causal-machine-learning/","name":"Causal Machine Learning","visibility":"public","feature_image":null,"description":null,"meta_title":null,"meta_description":null,"featureImageSharp":null}],"plaintext":"This post is the second post of the series on Causal Machine Learning. In the last blog post i have given a brief introduction on Causal Machine Learning. In this we will uncover the WHY !. This blog post is based on the work of Judea Pearl. I will discuss how naive statistics can fail (Spurious Correlations , Simpson Paradox & Asymmetry In Causal Inference). As always i will try to keep the things as simple as possible. So stay with me , Enjoy reading !\n\n\nHow (Naive) Statistics Can Fail Us !\n\n\n\n\n1. Spurious Correlations\n\n\n\n\n\nCorrellation Is Not Causation !!!\n\n\n\nI know you are bored of this like every other, hearing this oft-repeated saying “correlation does not imply causation”. I will formally illustrate why this is in this post.\n\nIn the website, “Spurious Correlations” by Tyler Vigen, we can explore a wide variety of statistical correlations (that are due to chance) with no causal implications. One of them is listed below.\n\n\nWe clearly know No of people who drowned while in a swimming pool has nothing to do with the power generated by US nuclear power plants !!. You can find more interesting “Spurious Correlations” from Tyler Vigen website.\n\n\n\n2. Simpson's Paradox\n\n\n\n\nSpurious correlations are well-known in statistics, so it’s easy (Somewhat !) to be on the lookout for it. 
Lets see a little known paradox in statistics.\n\nSimpson's paradox is a phenomenon in probability and statistics in which a trend appears in several groups of data but disappears or reverses when the groups are combined. Lets look at an example of Simpson's paradox from Wikipedia itself.\n\n\nIn both 1995 and 1996, Justice had a higher batting average (in bold type) than Jeter did. However, when the two baseball seasons are combined, Jeter shows a higher batting average than Justice.\n\n\n\n\nSame data give contradictory conclusions depending on how you look at them! .\n\n\n\nSimpson’s paradox highlights that how you look at your data matters. So the question becomes, how do we partition data? Although there is no standard method in statistics for this, causal inference provides a formalism to handle this problem. It all boils down to causal effects, which quantify the impact a variable has on another variable after adjusting for the appropriate confounders.\n\nLet us look at an example from “The Book of Why: The New Science of Cause and Effect” by Judea Pearl , and a post the legend himself posted in his twitter handle.\n\nConsider the below study that measures weekly exercise and cholesterol in various age groups. When we plot exercise on the X-axis and cholesterol on the Y-axis and segregate by age, as in left side of Fig , we see that there is a general trend downward in each group; the more young people exercise, the lower their cholesterol is, and the same applies for middle-aged people and the elderly. If, however, we use the same scatter plot, but we don’t segregate by gender (as in right side of Fig), we see a general trend upward; the more a person exercises, the higher their cholesterol is.\n\n\n\n\n\nExcercise appears to be beneficial in each age group but harmful in the population as a whole !!!\n\n\n\nTo resolve this problem, we once again turn to the story behind the data. 
If we know that older people, who are more likely to exercise  are also more likely to have high cholesterol regardless of exercise, then the reversal is easily explained, and easily resolved. Age is a common cause of both treatment (exercise) and outcome (cholesterol). So we should look at the age-segregated data in order to compare same-age people and thereby eliminate the possibility that the high exercisers in each group we examine are more likely to have high cholesterol due to their age, and not due to exercising.\n\nHowever, please do not get confused , segregated data does not always give the correct answer.\n\nLets look at another example from the Causal Inference In Statistics book by Pearl.\n\nIn the classical example used by Simpson (1951), a group of sick patients are given the option to try a new drug. Among those who took the drug, a lower percentage recovered than among those who did not. However, when we partition by gender, we see that more men taking the drug recover than do men are not taking the drug, and more women taking the drug recover than do women are not taking the drug!\n\n\nIn other words, the drug appears to help men and women, but hurt the general population. It seems nonsensical, or even impossible—which is why, of course, it is considered a paradox. Some people find it hard to believe that numbers could even be combined in such a way.\n\n\n\n\nThe data seem to say that if we know the patient’s gender male or female we can prescribe the drug, but if the gender is unknown we should not! Obviously, that conclusion is ridiculous. If the drug helps men and women, it must help anyone; our lack of knowledge of the patient’s gender cannot make the drug harmful.\n\n\n\nGiven the results of this study, then, should a doctor prescribe the drug for a woman? A man? A patient of unknown gender? Or consider a policy maker who is evaluating the drug’s overall effectiveness on the population. 
Should he/she use the recovery rate for the general population? Or should he/she use the recovery rates for the gendered sub-populations?\n\nThe answer is nowhere to be found in simple statistics.\n\nIn order to decide whether the drug will harm or help a patient, we first have to understand the story behind the data , the causal mechanism that led to, or generated, the results we see. For instance, suppose we knew an additional fact: Estrogen has a negative effect on recovery, so women are less likely to recover than men, regardless of the drug. In addition, as we can see from the data, women are significantly more likely to take the drug than men are. So, the reason the drug appears to be harmful overall is that, if we select a drug user at random, that person is more likely to be a woman and hence less likely to recover than a random person who does not take the drug. Put differently, being a woman is a common cause of both drug taking and failure to recover. Therefore, to assess the effectiveness, we need to compare subjects of the same gender, thereby ensuring that any difference in recovery rates between those who take the drug and those who do not is not ascribable to estrogen.\n\nIn fact, as statistics textbooks have traditionally (and correctly) warned students, correlation is not causation, so there is no statistical method that can determine the causal story from the data alone. Consequently, there is no statistical method that can aid in our decision.\n\n3. Symmetry\n\n\nThe problems with traditional statistics when thinking about causality stems from a fundamental property of algebra, symmetry . The left-hand side of an equation equals the right-hand side (that’s the point of algebra). The equal sign implies symmetry. However, causality is fundamentally asymmetric i.e. 
causes lead to effects and not the other way around.\n\nThis distinction further implies that causal relations cannot be expressed in the language of probability and, hence, that any mathematical approach to causal analysis must acquire new notation – probability calculus is insufficient. To illustrate, the syntax of probability calculus does not permit us to express the simple fact that “symptoms do not cause diseases,” let alone draw mathematical conclusions from such facts. All we can say is that two events are dependent—meaning that if we find one, we can expect to encounter the other, but we cannot distinguish statistical dependence, quantified by the conditional probability P(disease|symptom) from causal dependence, for which we have no expression in standard probability calculus.\n\nLet’s look at a simple example taken from Judea Pearl book itself . Suppose we model the relationship between a disease and the symptoms it produces, with the expression below. Y represents the severity of the symptoms, X the severity of the disease, m is the connection between the two, and b represents all other factors.\n\n\n\n\nUsing the rules of algebra we can invert the equation above to get the following expression.\n\n\n\n\nHere’s the problem, if we interpret the first equation as diseases cause symptoms, then we have to interpret the second equation as symptoms cause diseases! Which is of course not true.\n\nNote : Linear relations are used here for illustration purposes only; they do not represent typical disease-symptom relations but illustrate the historical development of path analysis.\n\n\nWhy Association (Correlation) Is Not Causation ?\n\n\nBefore moving to the next set of blog posts , I should precisely define what a correlation is. I know you all are bored with listening this oft-repeated saying \"Correlation is not causation\" , so am i. 
So lets sort this out before moving to anything else!\n\nBefore moving ahead lets clarify one more thing : “Correlation” is often colloquially used as a synonym for statistical dependence. However, “correlation” is technically only a measure of linear statistical dependence. We will largely be using the term association to refer to statistical dependence from now on.\n\nLets take an example from Brady Neal's Causal Course book.\n\nSay you happen upon some data that relates wearing shoes to bed and\nwaking up with a headache, as one does. It turns out that most times\nthat someone wears shoes to bed, that person wakes up with a headache.\nAnd most times someone doesn’t wear shoes to bed, that person doesn’t\nwake up with a headache. It is not uncommon for people to interpret\ndata like this (with associations) as meaning that wearing shoes to bed\ncauses people to wake up with headaches, especially if they are looking\nfor a reason to justify not wearing shoes to bed.\n\nWe can explain how wearing shoes to bed and headaches are associated\nwithout either being a cause of the other. It turns out that they are\nboth caused by a common cause: drinking the night before. This kind of variables are called \"confounder\" or lurking variable. We will call this kind of association confounding association since the association is facilitated by a confounder.\n\nThe main problem motivating causal inference is that association is not causation.\nIf the two were the same, then causal inference would be easy. Traditional statistics and machine learning would already have causal inference solved, as measuring causation would be as simple as just looking at measures such as correlation and predictive performance in data.\n\nLets look at another example. Lets attempt to determine the causal effect of vitamin C intake on resistance to sickness. 
Let X be defined as a binary indicator representing if this subject intakes vitamin C and let Y be a binary indicator of being healthy (not getting sick). X is also referred to as the ‘treatment’ in a more general setting. Now, let C1 be the value of Y if X=1 (vitamin C is taken) and C0 be the value of Y if X=0 (vitamin C is not taken). We call C0 and C1 the potential outcomes of this experiment.\n\n\nFor a single person, the causal effect of taking vitamin C in this context would be the difference between the expected outcome of taking vitamin C and the expected outcome of not taking vitamin C.\n\nCausal Effect = E(C1) – E(C0)\n\n\n\nUnfortunately, we can only ever observe one of the possible outcomes C0 or C1. We cannot perfectly reset all conditions to see the result of the opposite treatment. Instead, we can use multiple samples and calculate the association between Vitamin C and being healthy.\n\nAssociation = E(Y|X=1) – E(Y|X=0)\n\n\n\nAssociation as being   (1+1+1+1)/4 – (0+0+0+0)/4 = 1\n\nCausal effect, using the unobserved outcomes*, as being (4*0 + 4*1)/4 – (4*0 + 4*1)/4 = 0\n\nWe just calculated that, in this case, association does not equal causation. Observationally, there seems to be a perfect association between taking Vitamin C intake and being healthy. However, we can see there is no causal effect because we are privileged with the values of the unobserved outcomes. This inequality could be explained by considering that the people that stayed healthy practiced healthy habits which included taking Vitamin C.\n\nOkay, one more motivating example :\n\nIn response to a large study that studied the relationship between income and life expectancy, Vox published an article titled “Want to live longer, even if you’re poor? Then move to a big city in California” (Klein, 2016). 
However, as is implied by the title of the study “The Association Between Income and Life Expectancy in the United States, 2001-2014”, the study did not presume to make this recommendation and in fact the closest statement made to the Vox recommendation was “… the strongest pattern in the data was that low-income individuals tend to live longest (and have more healthful behaviors) in cities with highly educated populations, high incomes, and high levels of government expenditures, such as New York, New York, and San Francisco, California.” (Chetty et al., 2016).\n\nSimilar to the example regarding vitamin C and health, this study only found associative effects. However, just like it is incorrect to say that vitamin C causes a person to be healthy, it is also incorrect to say that moving to California will cause you to live longer.\n\nThat's all for now ! , Hope you enjoyed reading so far !\n\nIn the next blog post, we will further investigate the differences between association and causation, by starting with Pearl’s three-level causal hierarchy. That will be much interesting to watch out for !\n\n\nReferences\n\n\n 1. Simpson's Paradox - Technical Report By Judea Pearl\n 2. Causal Inferences In Statistics Judea Pearl - Chapter 1\n 3. Course : Causal Course - Brady Neal\n","html":"<p>This post is the second post of the series on <a href=\"http://localhost:2368/tag/causal-machine-learning\">Causal Machine Learning</a>. In the <a href=\"http://localhost:2368/introduction-to-causality-in-machine-learning/\">last blog post</a> i have given a brief introduction on Causal Machine Learning. In this we will uncover the WHY !. This blog post is based on the work of Judea Pearl. I will discuss how naive statistics can fail (Spurious Correlations , Simpson Paradox &amp; Asymmetry In Causal Inference). As always i will try to keep the things as simple as possible. 
So stay with me , Enjoy reading !</p><!--kg-card-begin: markdown--><h3 id=\"how-naive-statistics-can-fail-us\">How (Naive) Statistics Can Fail Us !</h3>\n<!--kg-card-end: markdown--><p></p><!--kg-card-begin: markdown--><h4 id=\"1-spurious-correlations\">1. Spurious Correlations</h4>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><blockquote>\n<p>Correllation Is Not Causation !!!</p>\n</blockquote>\n<!--kg-card-end: markdown--><p>I know you are bored of this like every other, hearing this oft-repeated saying “correlation does not imply causation”. I will formally illustrate why this is in this post. </p><p>In the website, <a href=\"http://www.tylervigen.com/spurious-correlations\">“Spurious Correlations”</a> by Tyler Vigen, we can explore a wide variety of statistical correlations (that are due to chance) with no causal implications. One of them is listed below.</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/210524097-c10421ba-b2f5-44fd-93c8-311a365e78c7.jpg\" align=\"center\" alt=\"Spurious Correllation\" width=\"720\" height=\"360\"/>\n<!--kg-card-end: markdown--><p>We clearly know No of people who drowned while in a swimming pool has nothing to do with the power generated by US nuclear power plants !!. You can find more interesting <a href=\"http://www.tylervigen.com/spurious-correlations\">“Spurious Correlations”</a> from Tyler Vigen website.</p><p></p><!--kg-card-begin: markdown--><h4 id=\"2-simpsons-paradox\">2. Simpson's Paradox</h4>\n<!--kg-card-end: markdown--><p></p><p>Spurious correlations are well-known in statistics, so it’s <em>easy (Somewhat !)</em> to be on the lookout for it. Lets see a little known paradox in statistics. 
</p><p><strong><a href=\"https://en.wikipedia.org/wiki/Simpson%27s_paradox\">Simpson's paradox</a></strong> is a phenomenon in <a href=\"https://en.wikipedia.org/wiki/Probability\">probability</a> and <a href=\"https://en.wikipedia.org/wiki/Statistics\">statistics</a> in which a trend appears in several groups of data but disappears or reverses when the groups are combined. Lets look at an example of Simpson's paradox from Wikipedia itself.</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/210533109-6f2a4803-ae3a-4cbb-82fa-38dd43b1a6a6.png\" align=\"center\" alt=\"Simpson's Paradox\" width=\"719\" height=\"276\"/>\n<!--kg-card-end: markdown--><p>In both 1995 and 1996, Justice had a higher batting average (in bold type) than Jeter did. However, when the two baseball seasons are combined, Jeter shows a higher batting average than Justice.</p><!--kg-card-begin: markdown--><blockquote>\n<p>Same data give contradictory conclusions depending on how you look at them! .</p>\n</blockquote>\n<!--kg-card-end: markdown--><p>Simpson’s paradox highlights that <em>how you look at your data matters.</em> So the question becomes, how do we partition data? Although there is no standard method in statistics for this, <em>causal inference provides a formalism to handle this problem</em>. It all boils down to causal effects, which quantify the impact a variable has on another variable after adjusting for the appropriate confounders. </p><p>Let us look at an example from “<a href=\"https://www.basicbooks.com/titles/judea-pearl/the-book-of-why/9780465097609/\">The Book of Why: The New Science of Cause and Effect</a>” by Judea Pearl , and a post the legend himself posted in his <a href=\"https://twitter.com/yudapearl/status/1411842797376659457\">twitter handle.</a></p><p>Consider the below study that measures weekly exercise and cholesterol in various age groups. 
When we plot exercise on the X-axis and cholesterol on the Y-axis and segregate by age, as in left side of Fig , we see that there is a general trend downward in each group; the more young people exercise, the lower their cholesterol is, and the same applies for middle-aged people and the elderly. If, however, we use the same scatter plot, but we don’t segregate by gender (as in right side of Fig), we see a general trend upward; the more a person exercises, the higher their cholesterol is. </p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/210539152-32dac61b-7722-41c8-8d1f-6e3fea1d33f3.jpg\" align=\"center\" alt=\"Book Of Why Simpson's Paradox\" width=\"580\" height=\"362\"/>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><blockquote>\n<p>Excercise appears to be beneficial in each age group but harmful in the population as a whole !!!</p>\n</blockquote>\n<!--kg-card-end: markdown--><p>To resolve this problem, we once again turn to the story behind the data. If we know that older people, who are more likely to exercise  are also more likely to have high cholesterol regardless of exercise, then the reversal is easily explained, and easily resolved. Age is a common cause of both treatment (exercise) and outcome (cholesterol). So we should look at the age-segregated data in order to compare same-age people and thereby eliminate the possibility that the high exercisers in each group we examine are more likely to have high cholesterol due to their age, and not due to exercising.</p><p><em><strong>However, please do not get confused , segregated data does not always give the correct answer.</strong></em></p><p>Lets look at another example from the Causal Inference In Statistics book by Pearl. </p><p>In the classical example used by Simpson (1951), a group of sick patients are given the option to try a new drug. Among those who took the drug, a lower percentage recovered than among those who did not. 
However, when we partition by gender, we see that more men taking the drug recover than do men are not taking the drug, and more women taking the drug recover than do women are not taking the drug! </p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/210541750-32d1eddf-be44-4dab-bea7-3246d893baad.jpeg\" align=\"center\" alt=\"Book Of Why Simpson's Paradox\" width=\"720\" height=\"182\"/>\n<!--kg-card-end: markdown--><p><em><strong>In other words, the drug appears to help men and women, but hurt the general population. It seems nonsensical, or even impossible—which is why, of course, it is considered a paradox.</strong></em> Some people find it hard to believe that numbers could even be combined in such a way. </p><!--kg-card-begin: markdown--><blockquote>\n<p>The data seem to say that if we know the patient’s gender male or female we can prescribe the drug, but if the gender is unknown we should not! Obviously, that conclusion is ridiculous. If the drug helps men and women, it must help anyone; our lack of knowledge of the patient’s gender cannot make the drug harmful.</p>\n</blockquote>\n<!--kg-card-end: markdown--><p>Given the results of this study, then, should a doctor prescribe the drug for a woman? A man? A patient of unknown gender? Or consider a policy maker who is evaluating the drug’s overall effectiveness on the population. Should he/she use the recovery rate for the general population? Or should he/she use the recovery rates for the gendered sub-populations?</p><p><strong>The answer is nowhere to be found in simple statistics.</strong></p><p>In order to decide whether the drug will harm or help a patient, <em>we first have to understand the story behind the data , the causal mechanism that led to, or generated, the results we see. 
</em>For instance, suppose we knew an additional fact: <em>Estrogen has a negative effect on recovery, so women are less likely to recover than men, regardless of the drug.</em> In addition, as we can see from the data, women are significantly more likely to take the drug than men are. So, the reason the drug appears to be harmful overall is that, if we select a drug user at random, that person is more likely to be a woman and hence less likely to recover than a random person who does not take the drug. <em>Put differently, being a woman is a common cause of both drug taking and failure to recover. Therefore, to assess the effectiveness, we need to compare subjects of the same gender, thereby ensuring that any difference in recovery rates between those who take the drug and those who do not is not ascribable to estrogen.</em></p><p>In fact, as statistics textbooks have traditionally (and correctly) warned students, correlation is not causation, so there is no statistical method that can determine the causal story from the data alone. Consequently, there is no statistical method that can aid in our decision.</p><!--kg-card-begin: markdown--><h4 id=\"3-symmetry\">3. Symmetry</h4>\n<!--kg-card-end: markdown--><p>The problems with traditional statistics when thinking about causality stems from a fundamental property of algebra, symmetry . The left-hand side of an equation equals the right-hand side (that’s the point of algebra). The equal sign implies symmetry. However, causality is fundamentally asymmetric i.e. causes lead to effects and not the other way around.</p><p>This distinction further implies that causal relations cannot be expressed in the language of probability and, hence, that any mathematical approach to causal analysis must acquire new notation – probability calculus is insufficient. 
To illustrate, the syntax of probability calculus does not permit us to express the simple fact that “symptoms do not cause diseases,” let alone draw mathematical conclusions from such facts. All we can say is that two events are dependent—meaning that if we find one, we can expect to encounter the other, but we cannot distinguish statistical dependence, quantified by the conditional probability <em>P(disease|symptom)</em> from causal dependence, for which we have no expression in standard probability calculus.</p><p>Let’s look at a simple example taken from Judea Pearl book itself . Suppose we model the relationship between a disease and the symptoms it produces, with the expression below. Y represents the severity of the symptoms, X the severity of the disease, m is the connection between the two, and b represents all other factors.</p><!--kg-card-begin: markdown--><p><img src=\"https://user-images.githubusercontent.com/33357428/210627825-c1e11271-a609-4bd7-b679-42dca0a40aee.jpg\" alt=\"SymptomsDisease\" loading=\"lazy\"></p>\n<!--kg-card-end: markdown--><p>Using the rules of algebra we can invert the equation above to get the following expression.</p><!--kg-card-begin: markdown--><p><img src=\"https://user-images.githubusercontent.com/33357428/210628024-cf038674-9241-4076-9422-1e45b6d1199e.jpg\" alt=\"SymptomDisease2\" loading=\"lazy\"></p>\n<!--kg-card-end: markdown--><p>Here’s the problem, if we interpret the first equation as <em>diseases cause symptoms</em>, then we have to interpret the second equation as <em>symptoms cause diseases!</em> Which is of course not true.</p><p><em>Note : Linear relations are used here for illustration purposes only; they do not represent typical disease-symptom relations but illustrate the historical development of path analysis.</em></p><!--kg-card-begin: markdown--><h3 id=\"why-association-correlation-is-not-causation\">Why Association (Correlation) Is Not Causation ?</h3>\n<!--kg-card-end: markdown--><p>Before moving to the 
next set of blog posts , I should precisely define what a correlation is. I know you all are bored with listening this oft-repeated saying \"Correlation is not causation\" , so am i. So lets sort this out before moving to anything else!</p><p>Before moving ahead lets clarify one more thing : “Correlation” is often colloquially used as a synonym for statistical dependence. However, “correlation” is technically only a measure of linear statistical dependence. We will largely be using the term association to refer to statistical dependence from now on.</p><p>Lets take an example from Brady Neal's Causal Course book.</p><p>Say you happen upon some data that relates wearing shoes to bed and<br>waking up with a headache, as one does. It turns out that most times<br>that someone wears shoes to bed, that person wakes up with a headache.<br>And most times someone doesn’t wear shoes to bed, that person doesn’t<br>wake up with a headache. It is not uncommon for people to interpret<br>data like this (with associations) as meaning that <em>wearing shoes to bed<br>causes people to wake up with headaches</em>, especially if they are looking<br>for a reason to justify not wearing shoes to bed.</p><p>We can explain <em>how wearing shoes to bed and headaches are associated<br>without either being a cause of the other</em>. I<em>t turns out that they are<br>both caused by a common cause: <u>drinking the night before.</u> </em>This kind of variables are called \"<strong><em>confounder\" </em></strong><em>or lurking variable. 
</em>We will call this kind of association confounding association since the association is facilitated by a confounder.</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/210763817-267c4af8-6ac6-4392-8dcb-3196a94ed19a.jpg\" align=\"center\" alt=\"Headache And Shoes\" width=\"338\" height=\"285\"/><!--kg-card-end: markdown--><p>The main problem motivating causal inference is that association is not causation.<br>If the two were the same, then causal inference would be easy. Traditional statistics and machine learning would already have causal inference solved, as measuring causation would be as simple as just looking at measures such as correlation and predictive performance in data.</p><p>Lets look at another example. Lets attempt to determine the causal effect of vitamin C intake on resistance to sickness. Let X be defined as a binary indicator representing if this subject intakes vitamin C and let Y be a binary indicator of being healthy (not getting sick). X is also referred to as the ‘treatment’ in a more general setting. Now, let C<sub>1</sub> be the value of Y if X=1 (vitamin C is taken) and C<sub>0</sub> be the value of Y if X=0 (vitamin C is not taken). We call C<sub>0</sub> and C<sub>1</sub> the potential outcomes of this experiment.</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/210771012-3809777d-83cb-4027-a73c-495690ec5876.jpg\" align=\"center\" alt=\"Association And Causation\" width=\"567\" height=\"330\"/>\n<!--kg-card-end: markdown--><p>For a single person, the causal effect of taking vitamin C in this context would be the difference between the expected outcome of taking vitamin C and the expected outcome of not taking vitamin C.</p><!--kg-card-begin: markdown--><pre><code>Causal Effect = E(C1) – E(C0)\n</code></pre>\n<!--kg-card-end: markdown--><p>Unfortunately, we can only ever observe one of the possible outcomes C<sub>0</sub> or C<sub>1</sub>. 
We cannot perfectly reset all conditions to see the result of the opposite treatment. Instead, we can use multiple samples and calculate the association between Vitamin C and being healthy.</p><!--kg-card-begin: markdown--><pre><code>Association = E(Y|X=1) – E(Y|X=0)\n</code></pre>\n<!--kg-card-end: markdown--><p>Association as being   (1+1+1+1)/4 – (0+0+0+0)/4 = 1</p><p>Causal effect, using the unobserved outcomes*, as being (4*0 + 4*1)/4 – (4*0 + 4*1)/4 = 0</p><p>We just calculated that, in this case, <strong>association does not equal causation. </strong>Observationally, there seems to be a <em>perfect association between taking Vitamin C intake and being healthy. </em>However, we can see there is no causal effect because we are privileged with the values of the unobserved outcomes. <em>This inequality could be explained by considering that the people that stayed healthy practiced healthy habits which included taking Vitamin C.</em></p><p>Okay, one more motivating example :</p><p>In response to a large study that studied the relationship between income and life expectancy, Vox published an article titled <em><a href=\"https://www.vox.com/2016/4/13/11420230/life-expectancy-income.\">“Want to live longer, even if you’re poor? 
Then move to a big city in California” (Klein, 2016).</a></em> However, as is implied by the title of the study “The Association Between Income and Life Expectancy in the United States, 2001-2014”, the study did not presume to make this recommendation and in fact the closest statement made to the Vox recommendation was <strong><em>“… the strongest pattern in the data was that low-income individuals tend to live longest (and have more healthful behaviors) in cities with highly educated populations, high incomes, and high levels of government expenditures, such as New York, New York, and San Francisco, California.”</em></strong> (Chetty et al., 2016).</p><p>Similar to the example regarding vitamin C and health, this study only found associative effects. However, just like it is incorrect to say that vitamin C causes a person to be healthy, it is also incorrect to say that moving to California will cause you to live longer.</p><p>That's all for now ! , Hope you enjoyed reading so far !</p><p>In the next blog post, we will further investigate the differences between association and causation, by starting with Pearl’s three-level causal hierarchy. 
That will be much interesting to watch out for !</p><!--kg-card-begin: markdown--><h3 id=\"references\">References</h3>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><ol>\n<li><a href=\"https://ftp.cs.ucla.edu/pub/stat_ser/r414.pdf\">Simpson's Paradox - Technical Report By Judea Pearl</a></li>\n<li><a href=\"https://bayes.cs.ucla.edu/PRIMER/ch1-preview.pdf\">Causal Inferences In Statistics Judea Pearl - Chapter 1</a></li>\n<li><a href=\"https://www.bradyneal.com/causal-inference-course\">Course : Causal Course - Brady Neal</a></li>\n</ol>\n<!--kg-card-end: markdown-->","url":"http://localhost:2368/causal-machine-learning-part-2/","canonical_url":null,"uuid":"7454be0f-fe1e-4d96-9bbe-d71b3c269fb0","codeinjection_foot":null,"codeinjection_head":null,"codeinjection_styles":null,"comment_id":"63aaf0f3bdc6867fe35525cb","reading_time":10,"send_email_when_published":null,"email_subject":null,"childHtmlRehype":{"html":"<p>This post is the second post of the series on <a href=\"/tag/causal-machine-learning\">Causal Machine Learning</a>. In the <a href=\"/introduction-to-causality-in-machine-learning/\">last blog post</a> i have given a brief introduction on Causal Machine Learning. In this we will uncover the WHY !. This blog post is based on the work of Judea Pearl. I will discuss how naive statistics can fail (Spurious Correlations , Simpson Paradox &#x26; Asymmetry In Causal Inference). As always i will try to keep the things as simple as possible. So stay with me , Enjoy reading !</p><!--kg-card-begin: markdown--><h3 id=\"how-naive-statistics-can-fail-us\">How (Naive) Statistics Can Fail Us !</h3>\n<!--kg-card-end: markdown--><p></p><!--kg-card-begin: markdown--><h4 id=\"1-spurious-correlations\">1. 
Spurious Correlations</h4>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><blockquote>\n<p>Correllation Is Not Causation !!!</p>\n</blockquote>\n<!--kg-card-end: markdown--><p>I know you are bored of this like every other, hearing this oft-repeated saying “correlation does not imply causation”. I will formally illustrate why this is in this post. </p><p>In the website, <a href=\"http://www.tylervigen.com/spurious-correlations\">“Spurious Correlations”</a> by Tyler Vigen, we can explore a wide variety of statistical correlations (that are due to chance) with no causal implications. One of them is listed below.</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/210524097-c10421ba-b2f5-44fd-93c8-311a365e78c7.jpg\" align=\"center\" alt=\"Spurious Correllation\" width=\"720\" height=\"360\">\n<!--kg-card-end: markdown--><p>We clearly know No of people who drowned while in a swimming pool has nothing to do with the power generated by US nuclear power plants !!. You can find more interesting <a href=\"http://www.tylervigen.com/spurious-correlations\">“Spurious Correlations”</a> from Tyler Vigen website.</p><p></p><!--kg-card-begin: markdown--><h4 id=\"2-simpsons-paradox\">2. Simpson's Paradox</h4>\n<!--kg-card-end: markdown--><p></p><p>Spurious correlations are well-known in statistics, so it’s <em>easy (Somewhat !)</em> to be on the lookout for it. Lets see a little known paradox in statistics. </p><p><strong><a href=\"https://en.wikipedia.org/wiki/Simpson%27s_paradox\">Simpson's paradox</a></strong> is a phenomenon in <a href=\"https://en.wikipedia.org/wiki/Probability\">probability</a> and <a href=\"https://en.wikipedia.org/wiki/Statistics\">statistics</a> in which a trend appears in several groups of data but disappears or reverses when the groups are combined. 
Lets look at an example of Simpson's paradox from Wikipedia itself.</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/210533109-6f2a4803-ae3a-4cbb-82fa-38dd43b1a6a6.png\" align=\"center\" alt=\"Simpson&#x27;s Paradox\" width=\"719\" height=\"276\">\n<!--kg-card-end: markdown--><p>In both 1995 and 1996, Justice had a higher batting average (in bold type) than Jeter did. However, when the two baseball seasons are combined, Jeter shows a higher batting average than Justice.</p><!--kg-card-begin: markdown--><blockquote>\n<p>Same data give contradictory conclusions depending on how you look at them! .</p>\n</blockquote>\n<!--kg-card-end: markdown--><p>Simpson’s paradox highlights that <em>how you look at your data matters.</em> So the question becomes, how do we partition data? Although there is no standard method in statistics for this, <em>causal inference provides a formalism to handle this problem</em>. It all boils down to causal effects, which quantify the impact a variable has on another variable after adjusting for the appropriate confounders. </p><p>Let us look at an example from “<a href=\"https://www.basicbooks.com/titles/judea-pearl/the-book-of-why/9780465097609/\">The Book of Why: The New Science of Cause and Effect</a>” by Judea Pearl , and a post the legend himself posted in his <a href=\"https://twitter.com/yudapearl/status/1411842797376659457\">twitter handle.</a></p><p>Consider the below study that measures weekly exercise and cholesterol in various age groups. When we plot exercise on the X-axis and cholesterol on the Y-axis and segregate by age, as in left side of Fig , we see that there is a general trend downward in each group; the more young people exercise, the lower their cholesterol is, and the same applies for middle-aged people and the elderly. 
If, however, we use the same scatter plot, but we don’t segregate by gender (as in right side of Fig), we see a general trend upward; the more a person exercises, the higher their cholesterol is. </p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/210539152-32dac61b-7722-41c8-8d1f-6e3fea1d33f3.jpg\" align=\"center\" alt=\"Book Of Why Simpson&#x27;s Paradox\" width=\"580\" height=\"362\">\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><blockquote>\n<p>Excercise appears to be beneficial in each age group but harmful in the population as a whole !!!</p>\n</blockquote>\n<!--kg-card-end: markdown--><p>To resolve this problem, we once again turn to the story behind the data. If we know that older people, who are more likely to exercise  are also more likely to have high cholesterol regardless of exercise, then the reversal is easily explained, and easily resolved. Age is a common cause of both treatment (exercise) and outcome (cholesterol). So we should look at the age-segregated data in order to compare same-age people and thereby eliminate the possibility that the high exercisers in each group we examine are more likely to have high cholesterol due to their age, and not due to exercising.</p><p><em><strong>However, please do not get confused , segregated data does not always give the correct answer.</strong></em></p><p>Lets look at another example from the Causal Inference In Statistics book by Pearl. </p><p>In the classical example used by Simpson (1951), a group of sick patients are given the option to try a new drug. Among those who took the drug, a lower percentage recovered than among those who did not. However, when we partition by gender, we see that more men taking the drug recover than do men are not taking the drug, and more women taking the drug recover than do women are not taking the drug! 
</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/210541750-32d1eddf-be44-4dab-bea7-3246d893baad.jpeg\" align=\"center\" alt=\"Book Of Why Simpson&#x27;s Paradox\" width=\"720\" height=\"182\">\n<!--kg-card-end: markdown--><p><em><strong>In other words, the drug appears to help men and women, but hurt the general population. It seems nonsensical, or even impossible—which is why, of course, it is considered a paradox.</strong></em> Some people find it hard to believe that numbers could even be combined in such a way. </p><!--kg-card-begin: markdown--><blockquote>\n<p>The data seem to say that if we know the patient’s gender male or female we can prescribe the drug, but if the gender is unknown we should not! Obviously, that conclusion is ridiculous. If the drug helps men and women, it must help anyone; our lack of knowledge of the patient’s gender cannot make the drug harmful.</p>\n</blockquote>\n<!--kg-card-end: markdown--><p>Given the results of this study, then, should a doctor prescribe the drug for a woman? A man? A patient of unknown gender? Or consider a policy maker who is evaluating the drug’s overall effectiveness on the population. Should he/she use the recovery rate for the general population? Or should he/she use the recovery rates for the gendered sub-populations?</p><p><strong>The answer is nowhere to be found in simple statistics.</strong></p><p>In order to decide whether the drug will harm or help a patient, <em>we first have to understand the story behind the data , the causal mechanism that led to, or generated, the results we see. </em>For instance, suppose we knew an additional fact: <em>Estrogen has a negative effect on recovery, so women are less likely to recover than men, regardless of the drug.</em> In addition, as we can see from the data, women are significantly more likely to take the drug than men are. 
So, the reason the drug appears to be harmful overall is that, if we select a drug user at random, that person is more likely to be a woman and hence less likely to recover than a random person who does not take the drug. <em>Put differently, being a woman is a common cause of both drug taking and failure to recover. Therefore, to assess the effectiveness, we need to compare subjects of the same gender, thereby ensuring that any difference in recovery rates between those who take the drug and those who do not is not ascribable to estrogen.</em></p><p>In fact, as statistics textbooks have traditionally (and correctly) warned students, correlation is not causation, so there is no statistical method that can determine the causal story from the data alone. Consequently, there is no statistical method that can aid in our decision.</p><!--kg-card-begin: markdown--><h4 id=\"3-symmetry\">3. Symmetry</h4>\n<!--kg-card-end: markdown--><p>The problems with traditional statistics when thinking about causality stems from a fundamental property of algebra, symmetry . The left-hand side of an equation equals the right-hand side (that’s the point of algebra). The equal sign implies symmetry. However, causality is fundamentally asymmetric i.e. causes lead to effects and not the other way around.</p><p>This distinction further implies that causal relations cannot be expressed in the language of probability and, hence, that any mathematical approach to causal analysis must acquire new notation – probability calculus is insufficient. To illustrate, the syntax of probability calculus does not permit us to express the simple fact that “symptoms do not cause diseases,” let alone draw mathematical conclusions from such facts. 
All we can say is that two events are dependent—meaning that if we find one, we can expect to encounter the other, but we cannot distinguish statistical dependence, quantified by the conditional probability <em>P(disease|symptom)</em> from causal dependence, for which we have no expression in standard probability calculus.</p><p>Let’s look at a simple example taken from Judea Pearl book itself . Suppose we model the relationship between a disease and the symptoms it produces, with the expression below. Y represents the severity of the symptoms, X the severity of the disease, m is the connection between the two, and b represents all other factors.</p><!--kg-card-begin: markdown--><p><img src=\"https://user-images.githubusercontent.com/33357428/210627825-c1e11271-a609-4bd7-b679-42dca0a40aee.jpg\" alt=\"SymptomsDisease\" loading=\"lazy\"></p>\n<!--kg-card-end: markdown--><p>Using the rules of algebra we can invert the equation above to get the following expression.</p><!--kg-card-begin: markdown--><p><img src=\"https://user-images.githubusercontent.com/33357428/210628024-cf038674-9241-4076-9422-1e45b6d1199e.jpg\" alt=\"SymptomDisease2\" loading=\"lazy\"></p>\n<!--kg-card-end: markdown--><p>Here’s the problem, if we interpret the first equation as <em>diseases cause symptoms</em>, then we have to interpret the second equation as <em>symptoms cause diseases!</em> Which is of course not true.</p><p><em>Note : Linear relations are used here for illustration purposes only; they do not represent typical disease-symptom relations but illustrate the historical development of path analysis.</em></p><!--kg-card-begin: markdown--><h3 id=\"why-association-correlation-is-not-causation\">Why Association (Correlation) Is Not Causation ?</h3>\n<!--kg-card-end: markdown--><p>Before moving to the next set of blog posts , I should precisely define what a correlation is. I know you all are bored with listening this oft-repeated saying \"Correlation is not causation\" , so am i. 
So lets sort this out before moving to anything else!</p><p>Before moving ahead lets clarify one more thing : “Correlation” is often colloquially used as a synonym for statistical dependence. However, “correlation” is technically only a measure of linear statistical dependence. We will largely be using the term association to refer to statistical dependence from now on.</p><p>Lets take an example from Brady Neal's Causal Course book.</p><p>Say you happen upon some data that relates wearing shoes to bed and<br>waking up with a headache, as one does. It turns out that most times<br>that someone wears shoes to bed, that person wakes up with a headache.<br>And most times someone doesn’t wear shoes to bed, that person doesn’t<br>wake up with a headache. It is not uncommon for people to interpret<br>data like this (with associations) as meaning that <em>wearing shoes to bed<br>causes people to wake up with headaches</em>, especially if they are looking<br>for a reason to justify not wearing shoes to bed.</p><p>We can explain <em>how wearing shoes to bed and headaches are associated<br>without either being a cause of the other</em>. I<em>t turns out that they are<br>both caused by a common cause: <u>drinking the night before.</u> </em>This kind of variables are called \"<strong><em>confounder\" </em></strong><em>or lurking variable. </em>We will call this kind of association confounding association since the association is facilitated by a confounder.</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/210763817-267c4af8-6ac6-4392-8dcb-3196a94ed19a.jpg\" align=\"center\" alt=\"Headache And Shoes\" width=\"338\" height=\"285\"><!--kg-card-end: markdown--><p>The main problem motivating causal inference is that association is not causation.<br>If the two were the same, then causal inference would be easy. 
Traditional statistics and machine learning would already have causal inference solved, as measuring causation would be as simple as just looking at measures such as correlation and predictive performance in data.</p><p>Lets look at another example. Lets attempt to determine the causal effect of vitamin C intake on resistance to sickness. Let X be defined as a binary indicator representing if this subject intakes vitamin C and let Y be a binary indicator of being healthy (not getting sick). X is also referred to as the ‘treatment’ in a more general setting. Now, let C<sub>1</sub> be the value of Y if X=1 (vitamin C is taken) and C<sub>0</sub> be the value of Y if X=0 (vitamin C is not taken). We call C<sub>0</sub> and C<sub>1</sub> the potential outcomes of this experiment.</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/210771012-3809777d-83cb-4027-a73c-495690ec5876.jpg\" align=\"center\" alt=\"Association And Causation\" width=\"567\" height=\"330\">\n<!--kg-card-end: markdown--><p>For a single person, the causal effect of taking vitamin C in this context would be the difference between the expected outcome of taking vitamin C and the expected outcome of not taking vitamin C.</p><!--kg-card-begin: markdown--><div class=\"kg-card kg-code-card gatsby-highlight\" data-language=\"text\"><pre class=\"language-text\"><code class=\"language-text\">Causal Effect = E(C1) – E(C0)\n</code></pre></div>\n<!--kg-card-end: markdown--><p>Unfortunately, we can only ever observe one of the possible outcomes C<sub>0</sub> or C<sub>1</sub>. We cannot perfectly reset all conditions to see the result of the opposite treatment. 
Instead, we can use multiple samples and calculate the association between Vitamin C and being healthy.</p><!--kg-card-begin: markdown--><div class=\"kg-card kg-code-card gatsby-highlight\" data-language=\"text\"><pre class=\"language-text\"><code class=\"language-text\">Association = E(Y|X=1) – E(Y|X=0)\n</code></pre></div>\n<!--kg-card-end: markdown--><p>Association as being   (1+1+1+1)/4 – (0+0+0+0)/4 = 1</p><p>Causal effect, using the unobserved outcomes*, as being (4*0 + 4*1)/4 – (4*0 + 4*1)/4 = 0</p><p>We just calculated that, in this case, <strong>association does not equal causation. </strong>Observationally, there seems to be a <em>perfect association between taking Vitamin C intake and being healthy. </em>However, we can see there is no causal effect because we are privileged with the values of the unobserved outcomes. <em>This inequality could be explained by considering that the people that stayed healthy practiced healthy habits which included taking Vitamin C.</em></p><p>Okay, one more motivating example :</p><p>In response to a large study that studied the relationship between income and life expectancy, Vox published an article titled <em><a href=\"https://www.vox.com/2016/4/13/11420230/life-expectancy-income.\">“Want to live longer, even if you’re poor? 
Then move to a big city in California” (Klein, 2016).</a></em> However, as is implied by the title of the study “The Association Between Income and Life Expectancy in the United States, 2001-2014”, the study did not presume to make this recommendation and in fact the closest statement made to the Vox recommendation was <strong><em>“… the strongest pattern in the data was that low-income individuals tend to live longest (and have more healthful behaviors) in cities with highly educated populations, high incomes, and high levels of government expenditures, such as New York, New York, and San Francisco, California.”</em></strong> (Chetty et al., 2016).</p><p>Similar to the example regarding vitamin C and health, this study only found associative effects. However, just like it is incorrect to say that vitamin C causes a person to be healthy, it is also incorrect to say that moving to California will cause you to live longer.</p><p>That's all for now ! , Hope you enjoyed reading so far !</p><p>In the next blog post, we will further investigate the differences between association and causation, by starting with Pearl’s three-level causal hierarchy. 
That will be much interesting to watch out for !</p><!--kg-card-begin: markdown--><h3 id=\"references\">References</h3>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><ol>\n<li><a href=\"https://ftp.cs.ucla.edu/pub/stat_ser/r414.pdf\">Simpson's Paradox - Technical Report By Judea Pearl</a></li>\n<li><a href=\"https://bayes.cs.ucla.edu/PRIMER/ch1-preview.pdf\">Causal Inferences In Statistics Judea Pearl - Chapter 1</a></li>\n<li><a href=\"https://www.bradyneal.com/causal-inference-course\">Course : Causal Course - Brady Neal</a></li>\n</ol>\n<!--kg-card-end: markdown-->","htmlAst":{"type":"root","children":[{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"This post is the second post of the series on "},{"type":"element","tagName":"a","properties":{"href":"/tag/causal-machine-learning"},"children":[{"type":"text","value":"Causal Machine Learning"}]},{"type":"text","value":". In the "},{"type":"element","tagName":"a","properties":{"href":"/introduction-to-causality-in-machine-learning/"},"children":[{"type":"text","value":"last blog post"}]},{"type":"text","value":" i have given a brief introduction on Causal Machine Learning. In this we will uncover the WHY !. This blog post is based on the work of Judea Pearl. I will discuss how naive statistics can fail (Spurious Correlations , Simpson Paradox & Asymmetry In Causal Inference). As always i will try to keep the things as simple as possible. 
So stay with me , Enjoy reading !"}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h3","properties":{"id":"how-naive-statistics-can-fail-us"},"children":[{"type":"text","value":"How (Naive) Statistics Can Fail Us !"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h4","properties":{"id":"1-spurious-correlations"},"children":[{"type":"text","value":"1. Spurious Correlations"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"blockquote","properties":{},"children":[{"type":"text","value":"\n"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Correllation Is Not Causation !!!"}]},{"type":"text","value":"\n"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"I know you are bored of this like every other, hearing this oft-repeated saying “correlation does not imply causation”. I will formally illustrate why this is in this post. "}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"In the website, "},{"type":"element","tagName":"a","properties":{"href":"http://www.tylervigen.com/spurious-correlations"},"children":[{"type":"text","value":"“Spurious Correlations”"}]},{"type":"text","value":" by Tyler Vigen, we can explore a wide variety of statistical correlations (that are due to chance) with no causal implications. 
One of them is listed below."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/210524097-c10421ba-b2f5-44fd-93c8-311a365e78c7.jpg","align":"center","alt":"Spurious Correllation","width":720,"height":360},"children":[]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"We clearly know No of people who drowned while in a swimming pool has nothing to do with the power generated by US nuclear power plants !!. You can find more interesting "},{"type":"element","tagName":"a","properties":{"href":"http://www.tylervigen.com/spurious-correlations"},"children":[{"type":"text","value":"“Spurious Correlations”"}]},{"type":"text","value":" from Tyler Vigen website."}]},{"type":"element","tagName":"p","properties":{},"children":[]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h4","properties":{"id":"2-simpsons-paradox"},"children":[{"type":"text","value":"2. Simpson's Paradox"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Spurious correlations are well-known in statistics, so it’s "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"easy (Somewhat !)"}]},{"type":"text","value":" to be on the lookout for it. Lets see a little known paradox in statistics. 
"}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"element","tagName":"strong","properties":{},"children":[{"type":"element","tagName":"a","properties":{"href":"https://en.wikipedia.org/wiki/Simpson%27s_paradox"},"children":[{"type":"text","value":"Simpson's paradox"}]}]},{"type":"text","value":" is a phenomenon in "},{"type":"element","tagName":"a","properties":{"href":"https://en.wikipedia.org/wiki/Probability"},"children":[{"type":"text","value":"probability"}]},{"type":"text","value":" and "},{"type":"element","tagName":"a","properties":{"href":"https://en.wikipedia.org/wiki/Statistics"},"children":[{"type":"text","value":"statistics"}]},{"type":"text","value":" in which a trend appears in several groups of data but disappears or reverses when the groups are combined. Lets look at an example of Simpson's paradox from Wikipedia itself."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/210533109-6f2a4803-ae3a-4cbb-82fa-38dd43b1a6a6.png","align":"center","alt":"Simpson's Paradox","width":719,"height":276},"children":[]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"In both 1995 and 1996, Justice had a higher batting average (in bold type) than Jeter did. However, when the two baseball seasons are combined, Jeter shows a higher batting average than Justice."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"blockquote","properties":{},"children":[{"type":"text","value":"\n"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Same data give contradictory conclusions depending on how you look at them! 
."}]},{"type":"text","value":"\n"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Simpson’s paradox highlights that "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"how you look at your data matters."}]},{"type":"text","value":" So the question becomes, how do we partition data? Although there is no standard method in statistics for this, "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"causal inference provides a formalism to handle this problem"}]},{"type":"text","value":". It all boils down to causal effects, which quantify the impact a variable has on another variable after adjusting for the appropriate confounders. "}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Let us look at an example from “"},{"type":"element","tagName":"a","properties":{"href":"https://www.basicbooks.com/titles/judea-pearl/the-book-of-why/9780465097609/"},"children":[{"type":"text","value":"The Book of Why: The New Science of Cause and Effect"}]},{"type":"text","value":"” by Judea Pearl , and a post the legend himself posted in his "},{"type":"element","tagName":"a","properties":{"href":"https://twitter.com/yudapearl/status/1411842797376659457"},"children":[{"type":"text","value":"twitter handle."}]}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Consider the below study that measures weekly exercise and cholesterol in various age groups. When we plot exercise on the X-axis and cholesterol on the Y-axis and segregate by age, as in left side of Fig , we see that there is a general trend downward in each group; the more young people exercise, the lower their cholesterol is, and the same applies for middle-aged people and the elderly. 
If, however, we use the same scatter plot, but we don’t segregate by gender (as in right side of Fig), we see a general trend upward; the more a person exercises, the higher their cholesterol is. "}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/210539152-32dac61b-7722-41c8-8d1f-6e3fea1d33f3.jpg","align":"center","alt":"Book Of Why Simpson's Paradox","width":580,"height":362},"children":[]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"blockquote","properties":{},"children":[{"type":"text","value":"\n"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Excercise appears to be beneficial in each age group but harmful in the population as a whole !!!"}]},{"type":"text","value":"\n"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"To resolve this problem, we once again turn to the story behind the data. If we know that older people, who are more likely to exercise  are also more likely to have high cholesterol regardless of exercise, then the reversal is easily explained, and easily resolved. Age is a common cause of both treatment (exercise) and outcome (cholesterol). 
So we should look at the age-segregated data in order to compare same-age people and thereby eliminate the possibility that the high exercisers in each group we examine are more likely to have high cholesterol due to their age, and not due to exercising."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"element","tagName":"em","properties":{},"children":[{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"However, please do not get confused , segregated data does not always give the correct answer."}]}]}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Lets look at another example from the Causal Inference In Statistics book by Pearl. "}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"In the classical example used by Simpson (1951), a group of sick patients are given the option to try a new drug. Among those who took the drug, a lower percentage recovered than among those who did not. However, when we partition by gender, we see that more men taking the drug recover than do men are not taking the drug, and more women taking the drug recover than do women are not taking the drug! "}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/210541750-32d1eddf-be44-4dab-bea7-3246d893baad.jpeg","align":"center","alt":"Book Of Why Simpson's Paradox","width":720,"height":182},"children":[]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"element","tagName":"em","properties":{},"children":[{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"In other words, the drug appears to help men and women, but hurt the general population. 
It seems nonsensical, or even impossible—which is why, of course, it is considered a paradox."}]}]},{"type":"text","value":" Some people find it hard to believe that numbers could even be combined in such a way. "}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"blockquote","properties":{},"children":[{"type":"text","value":"\n"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"The data seem to say that if we know the patient’s gender male or female we can prescribe the drug, but if the gender is unknown we should not! Obviously, that conclusion is ridiculous. If the drug helps men and women, it must help anyone; our lack of knowledge of the patient’s gender cannot make the drug harmful."}]},{"type":"text","value":"\n"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Given the results of this study, then, should a doctor prescribe the drug for a woman? A man? A patient of unknown gender? Or consider a policy maker who is evaluating the drug’s overall effectiveness on the population. Should he/she use the recovery rate for the general population? Or should he/she use the recovery rates for the gendered sub-populations?"}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"The answer is nowhere to be found in simple statistics."}]}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"In order to decide whether the drug will harm or help a patient, "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"we first have to understand the story behind the data , the causal mechanism that led to, or generated, the results we see. 
"}]},{"type":"text","value":"For instance, suppose we knew an additional fact: "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"Estrogen has a negative effect on recovery, so women are less likely to recover than men, regardless of the drug."}]},{"type":"text","value":" In addition, as we can see from the data, women are significantly more likely to take the drug than men are. So, the reason the drug appears to be harmful overall is that, if we select a drug user at random, that person is more likely to be a woman and hence less likely to recover than a random person who does not take the drug. "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"Put differently, being a woman is a common cause of both drug taking and failure to recover. Therefore, to assess the effectiveness, we need to compare subjects of the same gender, thereby ensuring that any difference in recovery rates between those who take the drug and those who do not is not ascribable to estrogen."}]}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"In fact, as statistics textbooks have traditionally (and correctly) warned students, correlation is not causation, so there is no statistical method that can determine the causal story from the data alone. Consequently, there is no statistical method that can aid in our decision."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h4","properties":{"id":"3-symmetry"},"children":[{"type":"text","value":"3. Symmetry"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"The problems with traditional statistics when thinking about causality stems from a fundamental property of algebra, symmetry . The left-hand side of an equation equals the right-hand side (that’s the point of algebra). 
The equal sign implies symmetry. However, causality is fundamentally asymmetric i.e. causes lead to effects and not the other way around."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"This distinction further implies that causal relations cannot be expressed in the language of probability and, hence, that any mathematical approach to causal analysis must acquire new notation – probability calculus is insufficient. To illustrate, the syntax of probability calculus does not permit us to express the simple fact that “symptoms do not cause diseases,” let alone draw mathematical conclusions from such facts. All we can say is that two events are dependent—meaning that if we find one, we can expect to encounter the other, but we cannot distinguish statistical dependence, quantified by the conditional probability "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"P(disease|symptom)"}]},{"type":"text","value":" from causal dependence, for which we have no expression in standard probability calculus."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Let’s look at a simple example taken from Judea Pearl book itself . Suppose we model the relationship between a disease and the symptoms it produces, with the expression below. 
Y represents the severity of the symptoms, X the severity of the disease, m is the connection between the two, and b represents all other factors."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/210627825-c1e11271-a609-4bd7-b679-42dca0a40aee.jpg","alt":"SymptomsDisease","loading":"lazy"},"children":[]}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Using the rules of algebra we can invert the equation above to get the following expression."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/210628024-cf038674-9241-4076-9422-1e45b6d1199e.jpg","alt":"SymptomDisease2","loading":"lazy"},"children":[]}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Here’s the problem, if we interpret the first equation as "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"diseases cause symptoms"}]},{"type":"text","value":", then we have to interpret the second equation as "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"symptoms cause diseases!"}]},{"type":"text","value":" Which is of course not true."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"Note : Linear relations are used here for illustration purposes only; they do not represent typical disease-symptom relations but illustrate the historical development of path 
analysis."}]}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h3","properties":{"id":"why-association-correlation-is-not-causation"},"children":[{"type":"text","value":"Why Association (Correlation) Is Not Causation ?"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Before moving to the next set of blog posts , I should precisely define what a correlation is. I know you all are bored with listening this oft-repeated saying \"Correlation is not causation\" , so am i. So lets sort this out before moving to anything else!"}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Before moving ahead lets clarify one more thing : “Correlation” is often colloquially used as a synonym for statistical dependence. However, “correlation” is technically only a measure of linear statistical dependence. We will largely be using the term association to refer to statistical dependence from now on."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Lets take an example from Brady Neal's Causal Course book."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Say you happen upon some data that relates wearing shoes to bed and"},{"type":"element","tagName":"br","properties":{},"children":[]},{"type":"text","value":"waking up with a headache, as one does. It turns out that most times"},{"type":"element","tagName":"br","properties":{},"children":[]},{"type":"text","value":"that someone wears shoes to bed, that person wakes up with a headache."},{"type":"element","tagName":"br","properties":{},"children":[]},{"type":"text","value":"And most times someone doesn’t wear shoes to bed, that person doesn’t"},{"type":"element","tagName":"br","properties":{},"children":[]},{"type":"text","value":"wake up with a headache. 
It is not uncommon for people to interpret"},{"type":"element","tagName":"br","properties":{},"children":[]},{"type":"text","value":"data like this (with associations) as meaning that "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"wearing shoes to bed"},{"type":"element","tagName":"br","properties":{},"children":[]},{"type":"text","value":"causes people to wake up with headaches"}]},{"type":"text","value":", especially if they are looking"},{"type":"element","tagName":"br","properties":{},"children":[]},{"type":"text","value":"for a reason to justify not wearing shoes to bed."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"We can explain "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"how wearing shoes to bed and headaches are associated"},{"type":"element","tagName":"br","properties":{},"children":[]},{"type":"text","value":"without either being a cause of the other"}]},{"type":"text","value":". I"},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"t turns out that they are"},{"type":"element","tagName":"br","properties":{},"children":[]},{"type":"text","value":"both caused by a common cause: "},{"type":"element","tagName":"u","properties":{},"children":[{"type":"text","value":"drinking the night before."}]},{"type":"text","value":" "}]},{"type":"text","value":"This kind of variables are called \""},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"confounder\" "}]}]},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"or lurking variable. 
"}]},{"type":"text","value":"We will call this kind of association confounding association since the association is facilitated by a confounder."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/210763817-267c4af8-6ac6-4392-8dcb-3196a94ed19a.jpg","align":"center","alt":"Headache And Shoes","width":338,"height":285},"children":[]},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"The main problem motivating causal inference is that association is not causation."},{"type":"element","tagName":"br","properties":{},"children":[]},{"type":"text","value":"If the two were the same, then causal inference would be easy. Traditional statistics and machine learning would already have causal inference solved, as measuring causation would be as simple as just looking at measures such as correlation and predictive performance in data."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Lets look at another example. Lets attempt to determine the causal effect of vitamin C intake on resistance to sickness. Let X be defined as a binary indicator representing if this subject intakes vitamin C and let Y be a binary indicator of being healthy (not getting sick). X is also referred to as the ‘treatment’ in a more general setting. Now, let C"},{"type":"element","tagName":"sub","properties":{},"children":[{"type":"text","value":"1"}]},{"type":"text","value":" be the value of Y if X=1 (vitamin C is taken) and C"},{"type":"element","tagName":"sub","properties":{},"children":[{"type":"text","value":"0"}]},{"type":"text","value":" be the value of Y if X=0 (vitamin C is not taken). 
We call C"},{"type":"element","tagName":"sub","properties":{},"children":[{"type":"text","value":"0"}]},{"type":"text","value":" and C"},{"type":"element","tagName":"sub","properties":{},"children":[{"type":"text","value":"1"}]},{"type":"text","value":" the potential outcomes of this experiment."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/210771012-3809777d-83cb-4027-a73c-495690ec5876.jpg","align":"center","alt":"Association And Causation","width":567,"height":330},"children":[]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"For a single person, the causal effect of taking vitamin C in this context would be the difference between the expected outcome of taking vitamin C and the expected outcome of not taking vitamin C."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"div","properties":{"className":["kg-card","kg-code-card","gatsby-highlight"],"dataLanguage":"text"},"children":[{"type":"element","tagName":"pre","properties":{"className":["language-text"]},"children":[{"type":"element","tagName":"code","properties":{"className":["language-text"]},"children":[{"type":"text","value":"Causal Effect = E(C1) – E(C0)\n"}]}]}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Unfortunately, we can only ever observe one of the possible outcomes C"},{"type":"element","tagName":"sub","properties":{},"children":[{"type":"text","value":"0"}]},{"type":"text","value":" or C"},{"type":"element","tagName":"sub","properties":{},"children":[{"type":"text","value":"1"}]},{"type":"text","value":". We cannot perfectly reset all conditions to see the result of the opposite treatment. 
Instead, we can use multiple samples and calculate the association between Vitamin C and being healthy."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"div","properties":{"className":["kg-card","kg-code-card","gatsby-highlight"],"dataLanguage":"text"},"children":[{"type":"element","tagName":"pre","properties":{"className":["language-text"]},"children":[{"type":"element","tagName":"code","properties":{"className":["language-text"]},"children":[{"type":"text","value":"Association = E(Y|X=1) – E(Y|X=0)\n"}]}]}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Association as being   (1+1+1+1)/4 – (0+0+0+0)/4 = 1"}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Causal effect, using the unobserved outcomes*, as being (4*0 + 4*1)/4 – (4*0 + 4*1)/4 = 0"}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"We just calculated that, in this case, "},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"association does not equal causation. "}]},{"type":"text","value":"Observationally, there seems to be a "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"perfect association between taking Vitamin C intake and being healthy. "}]},{"type":"text","value":"However, we can see there is no causal effect because we are privileged with the values of the unobserved outcomes. 
"},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"This inequality could be explained by considering that the people that stayed healthy practiced healthy habits which included taking Vitamin C."}]}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Okay, one more motivating example :"}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"In response to a large study that studied the relationship between income and life expectancy, Vox published an article titled "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"element","tagName":"a","properties":{"href":"https://www.vox.com/2016/4/13/11420230/life-expectancy-income."},"children":[{"type":"text","value":"“Want to live longer, even if you’re poor? Then move to a big city in California” (Klein, 2016)."}]}]},{"type":"text","value":" However, as is implied by the title of the study “The Association Between Income and Life Expectancy in the United States, 2001-2014”, the study did not presume to make this recommendation and in fact the closest statement made to the Vox recommendation was "},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"“… the strongest pattern in the data was that low-income individuals tend to live longest (and have more healthful behaviors) in cities with highly educated populations, high incomes, and high levels of government expenditures, such as New York, New York, and San Francisco, California.”"}]}]},{"type":"text","value":" (Chetty et al., 2016)."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Similar to the example regarding vitamin C and health, this study only found associative effects. 
However, just like it is incorrect to say that vitamin C causes a person to be healthy, it is also incorrect to say that moving to California will cause you to live longer."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"That's all for now ! , Hope you enjoyed reading so far !"}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"In the next blog post, we will further investigate the differences between association and causation, by starting with Pearl’s three-level causal hierarchy. That will be much interesting to watch out for !"}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h3","properties":{"id":"references"},"children":[{"type":"text","value":"References"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"ol","properties":{},"children":[{"type":"text","value":"\n"},{"type":"element","tagName":"li","properties":{},"children":[{"type":"element","tagName":"a","properties":{"href":"https://ftp.cs.ucla.edu/pub/stat_ser/r414.pdf"},"children":[{"type":"text","value":"Simpson's Paradox - Technical Report By Judea Pearl"}]}]},{"type":"text","value":"\n"},{"type":"element","tagName":"li","properties":{},"children":[{"type":"element","tagName":"a","properties":{"href":"https://bayes.cs.ucla.edu/PRIMER/ch1-preview.pdf"},"children":[{"type":"text","value":"Causal Inferences In Statistics Judea Pearl - Chapter 1"}]}]},{"type":"text","value":"\n"},{"type":"element","tagName":"li","properties":{},"children":[{"type":"element","tagName":"a","properties":{"href":"https://www.bradyneal.com/causal-inference-course"},"children":[{"type":"text","value":"Course : Causal Course - Brady Neal"}]}]},{"type":"text","value":"\n"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: 
markdown"}],"data":{"quirksMode":false}},"tableOfContents":[{"id":"how-naive-statistics-can-fail-us","heading":"How (Naive) Statistics Can Fail Us !","items":[{"id":"1-spurious-correlations","heading":"1. Spurious Correlations"},{"id":"2-simpsons-paradox","heading":"2. Simpson's Paradox"},{"id":"3-symmetry","heading":"3. Symmetry"}]},{"id":"why-association-correlation-is-not-causation","heading":"Why Association (Correlation) Is Not Causation ?"},{"id":"references","heading":"References"}]},"featureImageSharp":{"base":"photo-1642341438078-af255e0f08a3.jpg","publicURL":"/static/eafce611a386b0cfa9c67c2382184079/photo-1642341438078-af255e0f08a3.jpg","imageMeta":{"width":2000,"height":1335},"childImageSharp":{"fluid":{"base64":"data:image/jpeg;base64,/9j/2wBDABALDA4MChAODQ4SERATGCgaGBYWGDEjJR0oOjM9PDkzODdASFxOQERXRTc4UG1RV19iZ2hnPk1xeXBkeFxlZ2P/2wBDARESEhgVGC8aGi9jQjhCY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2P/wgARCAANABQDASIAAhEBAxEB/8QAFgABAQEAAAAAAAAAAAAAAAAAAwAF/8QAFgEBAQEAAAAAAAAAAAAAAAAAAgAB/9oADAMBAAIQAxAAAAHLRwODHO//xAAbEAACAgMBAAAAAAAAAAAAAAAAAQMRAhITIf/aAAgBAQABBQKNauRMo5Iz9ds//8QAFxEBAAMAAAAAAAAAAAAAAAAAAAECEf/aAAgBAwEBPwGLNf/EABYRAQEBAAAAAAAAAAAAAAAAABEAAf/aAAgBAgEBPwFY2//EABkQAAIDAQAAAAAAAAAAAAAAAAEQABEhMf/aAAgBAQAGPwLDBZvH1f/EABkQAQADAQEAAAAAAAAAAAAAAAEAESExQf/aAAgBAQABPyHRNTmQ/UKcg68gOkC9rJR7P//aAAwDAQACAAMAAAAQ6C//xAAWEQADAAAAAAAAAAAAAAAAAAABEBH/2gAIAQMBAT8QBF//xAAWEQADAAAAAAAAAAAAAAAAAAAAARH/2gAIAQIBAT8QhqIP/8QAGxABAQADAQEBAAAAAAAAAAAAAREAITFRQWH/2gAIAQEAAT8QFG5a7p44LOIXSgfv0wInW8bGb3bTN34wyhL0+5//2Q==","aspectRatio":1.4957264957264957,"src":"/static/eafce611a386b0cfa9c67c2382184079/ea4ab/photo-1642341438078-af255e0f08a3.jpg","srcSet":"/static/eafce611a386b0cfa9c67c2382184079/477ba/photo-1642341438078-af255e0f08a3.jpg 175w,\n/static/eafce611a386b0cfa9c67c2382184079/06776/photo-1642341438078-af255e0f08a3.jpg 350w,\n/static/eafce611a386b0cfa9c67c2382184079/ea4ab/photo-1642341438078-af255e0f08a3.jpg 
700w,\n/static/eafce611a386b0cfa9c67c2382184079/3055e/photo-1642341438078-af255e0f08a3.jpg 1050w,\n/static/eafce611a386b0cfa9c67c2382184079/eff08/photo-1642341438078-af255e0f08a3.jpg 1400w,\n/static/eafce611a386b0cfa9c67c2382184079/4e5f3/photo-1642341438078-af255e0f08a3.jpg 2000w","srcWebp":"/static/eafce611a386b0cfa9c67c2382184079/89afa/photo-1642341438078-af255e0f08a3.webp","srcSetWebp":"/static/eafce611a386b0cfa9c67c2382184079/9fca7/photo-1642341438078-af255e0f08a3.webp 175w,\n/static/eafce611a386b0cfa9c67c2382184079/37a4e/photo-1642341438078-af255e0f08a3.webp 350w,\n/static/eafce611a386b0cfa9c67c2382184079/89afa/photo-1642341438078-af255e0f08a3.webp 700w,\n/static/eafce611a386b0cfa9c67c2382184079/78e7a/photo-1642341438078-af255e0f08a3.webp 1050w,\n/static/eafce611a386b0cfa9c67c2382184079/03d34/photo-1642341438078-af255e0f08a3.webp 1400w,\n/static/eafce611a386b0cfa9c67c2382184079/49d6b/photo-1642341438078-af255e0f08a3.webp 2000w","sizes":"(max-width: 700px) 100vw, 700px"}}}}},{"node":{"id":"Ghost__Post__639aeedafbcf61465c0f70e5","title":"Introduction to Causality In Machine Learning - Part 1","slug":"introduction-to-causality-in-machine-learning","featured":true,"feature_image":"https://images.unsplash.com/photo-1485827404703-89b55fcc595e?crop=entropy&cs=tinysrgb&fit=max&fm=jpg&ixid=MnwxMTc3M3wwfDF8c2VhcmNofDExfHxtYWNoaW5lJTIwbGVhcm5pbmd8ZW58MHx8fHwxNjcyMTIxMjUz&ixlib=rb-4.0.3&q=80&w=2000","excerpt":"This post is the first of the series on Causal Machine Learning. I will start with the very basics of causal inference in this. Enjoy the reading!","custom_excerpt":"This post is the first of the series on Causal Machine Learning. I will start with the very basics of causal inference in this. 
Enjoy the reading!","visibility":"public","created_at_pretty":"15 Dec 2022","published_at_pretty":"15 Dec 2022","updated_at_pretty":"5 Jan 2023","created_at":"2022-12-15T15:24:34.000+05:30","published_at":"2022-12-15T15:29:30.000+05:30","updated_at":"2023-01-05T17:34:08.000+05:30","meta_title":null,"meta_description":null,"og_description":null,"og_image":null,"og_title":null,"twitter_description":null,"twitter_image":null,"twitter_title":null,"authors":[{"slug":"amaljith","url":"http://localhost:2368/author/amaljith/","name":"Amaljith","bio":"Research Scholar @ IIT Kharagpur","cover_image":null,"profile_image":"http://localhost:2368/content/images/2022/09/Screenshot-from-2022-09-07-18-00-00.png","location":null,"website":null,"twitter":null,"facebook":null,"meta_title":null,"meta_description":null,"coverImageSharp":null,"profileImageSharp":null}],"primary_author":{"slug":"amaljith","url":"http://localhost:2368/author/amaljith/","name":"Amaljith","bio":"Research Scholar @ IIT Kharagpur","cover_image":null,"profile_image":"http://localhost:2368/content/images/2022/09/Screenshot-from-2022-09-07-18-00-00.png","location":null,"website":null,"twitter":null,"facebook":null,"meta_title":null,"meta_description":null,"coverImageSharp":null,"profileImageSharp":{"base":"Screenshot-from-2022-09-07-18-00-00.png","publicURL":"/static/28e31bfedd96b4afe90237d2c1f700c3/Screenshot-from-2022-09-07-18-00-00.png","imageMeta":{"width":316,"height":237},"childImageSharp":{"fluid":{"base64":"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAPCAYAAADkmO9VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAEJElEQVQ4yyXO21fTBQDA8d9fUh5LQAW1eEImbIPJbSB3kZtijjSBxNvAGSAqlJZ64GAHIQ0VjkKICCiCQ8cYsYFjYwhsYxeGjJsopmEnH76dYw+f94/w2jmKfaSX9wtTrC1a+bBk5d83dlbnzFhHehnq62TBZeLjGyfvFqf4a9nGstuAx/oM96Qai76NieFu5u0jLLrNCCsOI47nat56J1hbtLH60kTfg5tUV6hQFeSSFLGTgtwcBtUteGc0zLoGmXNpmXU8Yd6txTOlxvq8hzmr/hNhyTaC06hmya3nn6VxHKMPKS08RFF+HgWK/aiO5JP3TQ4xMgn3mqt599bC8pyBRY+OxRkdXkc/rrE+POZ+nMY+hFXHKPPWP5m3a/i4Yqax9kfCREEkyKPJSk3mQFYmOe
lpRISGEBMuwahv4/WC4dNuwa3l1Us906O9jGs7sWg7EHQdv6NurWVmvJv33iHKTuSyft06tm3xJygwEIlIxDb/zUhFIvw2+HDrxkX+XjUx5x7A6+pn3qXBZnyIoacFTftNhKarlTxprsMz3sPbWR2jmiYiJSICt27lqwB/tmza+L8Af5KkYsYNHXhcvbidahy2x0ya2jFqm7lxqYxrF04jvLQMYB16jKazHrOulRWHjiJFOus/+5yAjX5s8vNhs58vfj5foooO43LJMaovncI1/ZhpWxcTlk6mRjrpa7uOrqsJYcaiY25imKftv6F52IBx4D61Z46y6Yv1+PpsYLOvD36+G0gJDuRcrIS0nTvY8XUAl66osDq60A3e5sHtXzCoW3CY+hHeeKdZdL5gytDDgsOAe1zL9KiawowEsoK2URi+HWWMmKvZuzibKuNcqoTiWBEXShToLHfo7K6h+9413DYzS/MehJnJYZxjOowDHdjHnjE53M2kUU1/zVmu742lOlNOkyKRmtzd1KkUtObFcacgmUetVTT0/kTDvQpemPoxDWmoralG8NrNvJqZwmbS8Xp5FrfNhMtuxtNei75Kia3xAmtdVZgaf6axvprKvXH8kB5FXd1lFMcyuVp7Hnd9E0+bb5Cdvgfhw4oXPq5hGR5kzGzkUVcPVeWlPK8tx/BrCfcrjnI+bx+KPSmcPJxP2QklFUolRfmHyf/uEEXK49y9XMndaxdJjpQirCy4WLCPoe38A4Nez8XySjISU5CFhCIO3k5wUDASkZiUuHj2Z2ahyN5HvuIgRUeOcUZZTOlJJeUlKgbU7Txta0AYbLnFqewMrpSepvhIITm700nblYg8IpJIWRjyiAjiI6JJjJKTGBtHcnwCqYlJpCelkJOWwfe533Jw337Kik9g1N5HqC9RcfyAgvTUPcRHyYmWypCFiJGKQggNDkYsEiEJEhEmCiE8REyYWIJMKiVCEoY8fCcJUTHEx8QgCw0l/8Be/gOTiTiUD46AHAAAAABJRU5ErkJggg==","aspectRatio":1.3333333333333333,"src":"/static/28e31bfedd96b4afe90237d2c1f700c3/6ccb0/Screenshot-from-2022-09-07-18-00-00.png","srcSet":"/static/28e31bfedd96b4afe90237d2c1f700c3/7d89d/Screenshot-from-2022-09-07-18-00-00.png 28w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/f4091/Screenshot-from-2022-09-07-18-00-00.png 55w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/6ccb0/Screenshot-from-2022-09-07-18-00-00.png 110w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/30481/Screenshot-from-2022-09-07-18-00-00.png 165w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/33bd6/Screenshot-from-2022-09-07-18-00-00.png 220w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/d398b/Screenshot-from-2022-09-07-18-00-00.png 316w","srcWebp":"/static/28e31bfedd96b4afe90237d2c1f700c3/8678c/Screenshot-from-2022-09-07-18-00-00.webp","srcSetWebp":"/static/28e31bfedd96b4afe90237d2c1f700c3/59cda/Screenshot-from-2022-09-07-18-00-00.webp 
28w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/7da75/Screenshot-from-2022-09-07-18-00-00.webp 55w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/8678c/Screenshot-from-2022-09-07-18-00-00.webp 110w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/f282e/Screenshot-from-2022-09-07-18-00-00.webp 165w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/a7b21/Screenshot-from-2022-09-07-18-00-00.webp 220w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/fb2b8/Screenshot-from-2022-09-07-18-00-00.webp 316w","sizes":"(max-width: 110px) 100vw, 110px"}}}},"primary_tag":{"slug":"machine-learning","url":"http://localhost:2368/tag/machine-learning/","name":"Machine Learning","visibility":"public","feature_image":null,"description":null,"meta_title":null,"meta_description":null,"featureImageSharp":null},"tags":[{"slug":"machine-learning","url":"http://localhost:2368/tag/machine-learning/","name":"Machine Learning","visibility":"public","feature_image":null,"description":null,"meta_title":null,"meta_description":null,"featureImageSharp":null},{"slug":"artificial-intelligence","url":"http://localhost:2368/tag/artificial-intelligence/","name":"Artificial Intelligence","visibility":"public","feature_image":null,"description":null,"meta_title":null,"meta_description":null,"featureImageSharp":null},{"slug":"causal-machine-learning","url":"http://localhost:2368/tag/causal-machine-learning/","name":"Causal Machine Learning","visibility":"public","feature_image":null,"description":null,"meta_title":null,"meta_description":null,"featureImageSharp":null}],"plaintext":"\n\n\nTo Build Truly Intelligent Machines, Teach Them Cause and Effect.\n\n\n\n\n\n\nCorrelation is not Causation!.\n\n\n\nThis post is the first of the series on Causal Machine Learning. 
I will start with the very basics of causal inference , Provide some basic background in Bayesian networks/graphical models , Show how graphical models can be used in causal inference and Describe application scenarios , case studies and the practical difficulties in Causal Inference. I will follow step by step approach and keep the things as simple as possible with lot of practical tutorials. In each of the session i will provide additional learning materials and references , which should help you to keep the flow. In this blog post , i will present Why we need causality in Machine Learning! . Enjoy the reading!\n\n\nWhat is Causal AI ?\n\n\nDeep learning techniques do a good job at building models by correlating data points. But many AI researchers believe that more work needs to be done to understand causation and not just correlation. The field causal deep learning -- useful in determining why something happened -- is still in its infancy, and it is much more difficult to automate than neural networks.\n\n\n\n\nEven though we can observe correlation, it does not prove causation.\n\n\n\nCorrelation: measures the relationship between two things.\nCausation: means that one thing will cause the other to happen.\n\n\n\n * \n   \n   \n   I know that when I see X, I will see Y (Association/Correlation)\n   \n   \n\n * \n   \n   \n   I know that X causes Y (Causality)\n   \n   \n\n\nThe distinctions between the two can have important implications. In the website, “Spurious Correlations” by Tyler Vigen, we can explore a wide variety of correlations that are due to chance. 
One of the interesting spurious correlation is listed below.\n\n\n\n\nWhile these two variables have a correlation of 95.23%, it is highly unrealistic to think that a degree in mathematics caused an increase in the amount of Uranium stored in the United States!\n\n\n\n\n\n\n\n“If Correlation Doesn’t Imply Causation, Then What Does?”\n\n\n\nWe will uncover this in the upcoming blog posts.\n\nThe excitement in the field has been kindled by Judea Pearl, a professor at UCLA, who did some of the formative work on implementing Bayesian networks for statistical analysis. More recently he has been developing a framework for diagramming causation and teasing apart the factors that contribute to observed events in a computable framework.\n\n\n\n\nIn his latest book, “The Book of Why: The New Science of Cause and Effect,” he argues that artificial intelligence has been handicapped by an incomplete understanding of what intelligence really is. In his new book, Pearl, elaborates a vision for how truly intelligent machines would think. The key, he argues, is to replace reasoning by association with causal reasoning. Instead of the mere ability to correlate fever and malaria, machines need the capacity to reason that malaria causes fever. Pearl expects that causal reasoning could provide machines with human-level intelligence.\n\n\n\n\nUnderstanding why requires understanding of the whats, the wheres, and the whens. The hows, however, seem to be an implementation of the whys !!\n\n\nTraditional ML Vs Causal ML\n\n\nMachine learning is a type of artificial intelligence that involves training algorithms to recognise patterns in data and make predictions or decisions based on those patterns. The goal of machine learning is to build models that can generalise to new data and make accurate predictions or decisions.\n\nThe main difference between machine learning and causal machine learning is the focus of the analysis. 
While traditional machine learning aims to predict an outcome based on patterns in the data, causal machine learning aims to identify the specific variables that are causing an outcome and how they contribute to it. This involves identifying and modelling the causal relationships between variables, rather than just the statistical relationships.\n\nCausal machine learning is a type of machine learning that focuses on identifying the cause-and-effect relationships between variables. In other words, it aims to identify the factors that influence a particular outcome and how they influence it.\n\nCausal machine learning can be used to improve the accuracy of predictive models by taking into account the underlying causes of the outcomes being predicted. It is also useful in a variety of applications, such as evaluating the effectiveness of interventions, predicting the impact of policy changes, and identifying the factors that contribute to certain outcomes.\n\n\nExplainable AI vs Causal AI\n\n\nExplainable AI (XAI) is a type of artificial intelligence (AI) that is designed to be transparent and interpretable, meaning that it can provide explanations for its decisions and actions. The goal of XAI is to make AI systems more transparent and understandable to humans, so that they can be trusted and used more effectively.\n\nCausal AI, on the other hand, is a type of AI that focuses on identifying the cause-and-effect relationships between variables. It is a sub field of machine learning that aims to identify the specific factors that influence a particular outcome and how they contribute to it. One key difference between explainable AI and causal AI is their focus. Explainable AI is concerned with making AI systems more transparent and interpretable, while causal AI is focused on identifying the causes and effects of various phenomena.\n\nAnother difference is the techniques used. 
Explainable AI often employs techniques such as feature importance, sensitivity analysis, and model interpretation methods to provide insights into the decision-making process of AI systems. Causal AI, on the other hand, typically uses techniques such as structural equation modelling, instrumental variables, and counterfactual analysis to identify the causal relationships between variables.\n\nWhile XAI and causal AI are related in that they both aim to increase our understanding of how AI systems work, they are distinct concepts. XAI focuses on providing explanations for the decisions and predictions made by AI systems, while causal AI focuses on identifying the underlying causes of outcomes.\n\nOne way in which XAI and causal AI can be related is that XAI methods, such as counterfactual analysis, can be used to identify the causes of particular outcomes or decisions. However, it is important to note that not all XAI methods are causal in nature, and not all causal AI systems are necessarily explainable. Overall, both XAI and causal AI are important tools for improving our understanding of how AI systems work and for increasing the transparency and trustworthiness of AI systems.\n\n\nApplications\n\n\nCausal machine learning can be applied in many different fields and industries to identify and understand the causal relationships between variables. Here are a few examples of the application of causal machine learning:\n\n 1. Healthcare: Causal machine learning can be used to identify the factors that contribute to the development of certain diseases and to predict the likelihood of future events, such as hospital re-admissions. It can also be used to understand the factors that influence patient outcomes and to develop personalised treatment plans.\n 2. Finance: Causal machine learning can be used to understand the factors that influence stock prices and to develop trading strategies. 
It can also be used to identify the causes of financial fraud and to develop strategies to prevent it.\n 3. Marketing: Causal machine learning can be used to understand the factors that influence customer behaviour and to optimize marketing campaigns. It can also be used to identify the factors that drive customer loyalty and to develop strategies to retain customers.\n 4. Education: Causal machine learning can be used to understand the factors that influence student learning and to develop personalized learning strategies. It can also be used to identify the causes of academic achievement gaps and to develop strategies to address them.\n 5. Social science: Causal machine learning can be used to understand the factors that influence social outcomes and to develop policies that address social issues. It can also be used to identify the causes of social inequality and to develop strategies to address it.\n\nThese are just a few examples of the many ways in which causal machine learning can be applied. There are many more applications in various fields and industries.\n\n\nConclusion\n\n\nIn this post I have provided an introduction to Causal machine learning. I have tried to be as compact as possible. Below you can find some additional resources if you want to know more about Causal machine learning. Hope you enjoyed reading so far!.\n\n\nReferences\n\n\n 1. \n    \n    \n    Textbook : Causal Machine Learning - Mannings\n    \n\n 2. \n    \n    \n    Paper : Causal Machine Learning: A Survey and Open Problems, 2022. Jean Kaddour, Aengus Lynch, Qi Liu, Matt J. Kusner, Ricardo Silva.\n    \n\n 3. \n    \n    \n    Workshop : NeurIPS 2021 Workshop\n    \n\n 4. \n    \n    \n    Course : Machine Learning & Causal Inference: A Short Course\n    \n\n 5. \n    \n    \n    Texbook : Causal inference in statistics:An overview. Judea Pearl\n    \n\n 6. 
\n    \n    \n    Industry : Causality and Machine Learning: Microsoft Research\n    \n","html":"<!--kg-card-begin: markdown--><blockquote>\n<p>To Build Truly Intelligent Machines, Teach Them Cause and Effect.</p>\n</blockquote>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><blockquote>\n<p>Correlation is not Causation!.</p>\n</blockquote>\n<!--kg-card-end: markdown--><p>This post is the first of the series on <a href=\"http://localhost:2368/tag/causal-machine-learning\">Causal Machine Learning</a>. I will start with the very basics of causal inference , Provide some basic background in Bayesian networks/graphical models , Show how graphical models can be used in causal inference and Describe application scenarios , case studies and the practical difficulties in Causal Inference. I will follow step by step approach and keep the things as simple as possible with lot of practical tutorials. In each of the session i will provide additional learning materials and references , which should help you to keep the flow. In this blog post , i will present <em>Why we need causality in Machine Learning! </em>. Enjoy the reading!</p><!--kg-card-begin: markdown--><h3 id=\"what-is-causal-ai\">What is Causal AI ?</h3>\n<!--kg-card-end: markdown--><p>Deep learning techniques do a good job at building models by correlating data points. But many AI researchers believe that more work needs to be done to understand causation and not just correlation. 
The field causal deep learning -- useful in determining <em>why something happened </em>-- is still in its infancy, and it is much more difficult to automate than neural networks.</p><!--kg-card-begin: markdown--><blockquote>\n<p>Even though we can observe correlation, it does not prove causation.</p>\n</blockquote>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><pre><code>Correlation: measures the relationship between two things.\nCausation: means that one thing will cause the other to happen.\n</code></pre>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><ul>\n<li>\n<pre><code>I know that when I see X, I will see Y (Association/Correlation)\n</code></pre>\n</li>\n<li>\n<pre><code>I know that X causes Y (Causality)\n</code></pre>\n</li>\n</ul>\n<!--kg-card-end: markdown--><p>The distinctions between the two can have important implications. In the website, <a href=\"http://www.tylervigen.com/spurious-correlations\">“Spurious Correlations”</a> by Tyler Vigen, we can explore a wide variety of correlations that are due to chance. One of the interesting spurious correlation is listed below. 
</p><!--kg-card-begin: markdown--><p><img src=\"https://user-images.githubusercontent.com/33357428/209831536-c820c96b-3347-4a30-b0f0-dca9e409e6f0.jpeg\" align=\"center\"\nalt=\"Spurious Correlation Example\" width = \"720\" height=\"283\"></p>\n<!--kg-card-end: markdown--><p>While these two variables have a correlation of 95.23%, <em>it is highly unrealistic to think that a degree in mathematics caused an increase in the amount of Uranium stored in the United States!</em></p><!--kg-card-begin: markdown--><p><img src=\"https://user-images.githubusercontent.com/33357428/208087796-ad8b5727-8db3-4d71-9881-8a4f56169964.png\" align=\"center\"\nalt=\"CorrelationVsCausation\"></p>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><blockquote>\n<p>“If Correlation Doesn’t Imply Causation, Then What Does?”</p>\n</blockquote>\n<!--kg-card-end: markdown--><p>We will uncover this in the upcoming blog posts.</p><p>The excitement in the field has been kindled by Judea Pearl, a professor at UCLA, who did some of the formative work on implementing <a href=\"https://www.techtarget.com/searchenterpriseai/feature/Bayesian-networks-applications-are-fueling-enterprise-support\">Bayesian networks for statistical analysis</a>. More recently he has been developing a framework for diagramming causation and teasing apart the factors that contribute to observed events in a computable framework. </p><!--kg-card-begin: markdown--><p><img src=\"https://user-images.githubusercontent.com/33357428/208084630-fd654f0e-0bc2-4576-844a-5d398c75d5a8.jpg\" align=\"center\"\nalt=\"The Book Of Why\" width = \"600\" height=\"400\"></p>\n<!--kg-card-end: markdown--><p>In his latest book, “<a href=\"https://www.basicbooks.com/titles/judea-pearl/the-book-of-why/9780465097609/\">The Book of Why: The New Science of Cause and Effect</a>,” he argues that artificial intelligence has been handicapped by an incomplete understanding of what intelligence really is. 
In his new book, Pearl, elaborates a vision for how truly intelligent machines would think. The key, he argues, is to replace reasoning by association with causal reasoning. <em>Instead of the mere ability to correlate fever and malaria, machines need the capacity to reason that malaria causes fever. </em>Pearl expects that causal reasoning could provide machines with human-level intelligence.</p><!--kg-card-begin: markdown--><p><img src=\"https://user-images.githubusercontent.com/33357428/208090292-28654bde-487b-4f4e-b1ea-dd957c8ae7f0.jpg\" align=\"center\" alt=\"The Why ?\" height=\n\"400\"/></p>\n<!--kg-card-end: markdown--><p>Understanding <em>why</em> requires understanding of the <em>whats, the wheres, and the whens.</em> The <em>hows</em>, however, seem to be an implementation of the <em>whys !!</em></p><!--kg-card-begin: markdown--><h3 id=\"traditional-ml-vs-causal-ml\">Traditional ML Vs Causal ML</h3>\n<!--kg-card-end: markdown--><p>Machine learning is a type of artificial intelligence that involves training algorithms to recognise patterns in data and make predictions or decisions based on those patterns. The goal of machine learning is to build models that can generalise to new data and make accurate predictions or decisions.</p><p>The main difference between machine learning and causal machine learning is the focus of the analysis. While traditional machine learning aims to predict an outcome based on patterns in the data, causal machine learning aims to identify the specific variables that are causing an outcome and how they contribute to it. This involves identifying and modelling the causal relationships between variables, rather than just the statistical relationships.</p><p>Causal machine learning is a type of machine learning that focuses on identifying the cause-and-effect relationships between variables. 
In other words, it aims to identify the factors that influence a particular outcome and how they influence it.</p><p>Causal machine learning can be used to improve the accuracy of predictive models by taking into account the underlying causes of the outcomes being predicted. It is also useful in a variety of applications, such as evaluating the effectiveness of interventions, predicting the impact of policy changes, and identifying the factors that contribute to certain outcomes.</p><!--kg-card-begin: markdown--><h3 id=\"explainable-ai-vs-causal-ai\">Explainable AI vs Causal AI</h3>\n<!--kg-card-end: markdown--><p><em>Explainable AI (XAI)</em> is a type of artificial intelligence (AI) that is designed to be transparent and interpretable, meaning that it can provide explanations for its decisions and actions. The goal of XAI is to make AI systems more transparent and understandable to humans, so that they can be trusted and used more effectively.</p><p>Causal AI, on the other hand, is a type of AI that focuses on identifying the cause-and-effect relationships between variables. It is a sub field of machine learning that aims to identify the specific factors that influence a particular outcome and how they contribute to it. One key difference between explainable AI and causal AI is their focus. Explainable AI is concerned with making AI systems more transparent and interpretable, while causal AI is focused on identifying the causes and effects of various phenomena.</p><p>Another difference is the techniques used. Explainable AI often employs techniques such as feature importance, sensitivity analysis, and model interpretation methods to provide insights into the decision-making process of AI systems. 
Causal AI, on the other hand, typically uses techniques such as structural equation modelling, instrumental variables, and counterfactual analysis to identify the causal relationships between variables.</p><p>While XAI and causal AI are related in that they both aim to increase our understanding of how AI systems work, they are distinct concepts. XAI focuses on providing explanations for the decisions and predictions made by AI systems, while causal AI focuses on identifying the underlying causes of outcomes.</p><p>One way in which XAI and causal AI can be related is that XAI methods, such as counterfactual analysis, can be used to identify the causes of particular outcomes or decisions. However, it is important to note that not all XAI methods are causal in nature, and not all causal AI systems are necessarily explainable. Overall, both XAI and causal AI are important tools for improving our understanding of how AI systems work and for increasing the transparency and trustworthiness of AI systems.</p><!--kg-card-begin: markdown--><h3 id=\"applications\">Applications</h3>\n<!--kg-card-end: markdown--><p>Causal machine learning can be applied in many different fields and industries to identify and understand the causal relationships between variables. Here are a few examples of the application of causal machine learning:</p><ol><li>Healthcare: Causal machine learning can be used to identify the factors that contribute to the development of certain diseases and to predict the likelihood of future events, such as hospital re-admissions. It can also be used to understand the factors that influence patient outcomes and to develop personalised treatment plans.</li><li>Finance: Causal machine learning can be used to understand the factors that influence stock prices and to develop trading strategies. 
It can also be used to identify the causes of financial fraud and to develop strategies to prevent it.</li><li>Marketing: Causal machine learning can be used to understand the factors that influence customer behaviour and to optimize marketing campaigns. It can also be used to identify the factors that drive customer loyalty and to develop strategies to retain customers.</li><li>Education: Causal machine learning can be used to understand the factors that influence student learning and to develop personalized learning strategies. It can also be used to identify the causes of academic achievement gaps and to develop strategies to address them.</li><li>Social science: Causal machine learning can be used to understand the factors that influence social outcomes and to develop policies that address social issues. It can also be used to identify the causes of social inequality and to develop strategies to address it.</li></ol><p>These are just a few examples of the many ways in which causal machine learning can be applied. There are many more applications in various fields and industries.</p><!--kg-card-begin: markdown--><h3 id=\"conclusion\">Conclusion</h3>\n<!--kg-card-end: markdown--><p>In this post I have provided an introduction to Causal machine learning. I have tried to be as compact as possible. Below you can find some additional resources if you want to know more about Causal machine learning. Hope you enjoyed reading so far!. </p><!--kg-card-begin: markdown--><h3 id=\"references\">References</h3>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><ol>\n<li>\n<p><a href=\"https://www.manning.com/books/causal-machine-learning\">Textbook : Causal Machine Learning - Mannings</a></p>\n</li>\n<li>\n<p><a href=\"https://arxiv.org/abs/2206.15475\">Paper : Causal Machine Learning: A Survey and Open Problems, 2022. Jean Kaddour, Aengus Lynch, Qi Liu, Matt J. 
Kusner, Ricardo Silva.</a></p>\n</li>\n<li>\n<p><a href=\"https://why21.causalai.net\">Workshop : NeurIPS 2021 Workshop</a></p>\n</li>\n<li>\n<p><a href=\"https://www.youtube.com/playlist?list=PLxq_lXOUlvQAoWZEqhRqHNezS30lI49G-\">Course : Machine Learning &amp; Causal Inference: A Short Course</a></p>\n</li>\n<li>\n<p><a href=\"https://ftp.cs.ucla.edu/pub/stat_ser/r350.pdf\">Texbook : Causal inference in statistics:An overview. Judea Pearl</a></p>\n</li>\n<li>\n<p><a href=\"https://www.microsoft.com/en-us/research/group/causal-inference/#!publications\">Industry : Causality and Machine Learning: Microsoft Research</a></p>\n</li>\n</ol>\n<!--kg-card-end: markdown-->","url":"http://localhost:2368/introduction-to-causality-in-machine-learning/","canonical_url":null,"uuid":"5011289e-f32c-4f7d-8d67-158b64251de8","codeinjection_foot":null,"codeinjection_head":null,"codeinjection_styles":null,"comment_id":"639aeedafbcf61465c0f70e5","reading_time":6,"send_email_when_published":null,"email_subject":null,"childHtmlRehype":{"html":"<!--kg-card-begin: markdown--><blockquote>\n<p>To Build Truly Intelligent Machines, Teach Them Cause and Effect.</p>\n</blockquote>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><blockquote>\n<p>Correlation is not Causation!.</p>\n</blockquote>\n<!--kg-card-end: markdown--><p>This post is the first of the series on <a href=\"/tag/causal-machine-learning\">Causal Machine Learning</a>. I will start with the very basics of causal inference , Provide some basic background in Bayesian networks/graphical models , Show how graphical models can be used in causal inference and Describe application scenarios , case studies and the practical difficulties in Causal Inference. I will follow step by step approach and keep the things as simple as possible with lot of practical tutorials. In each of the session i will provide additional learning materials and references , which should help you to keep the flow. 
In this blog post , i will present <em>Why we need causality in Machine Learning! </em>. Enjoy the reading!</p><!--kg-card-begin: markdown--><h3 id=\"what-is-causal-ai\">What is Causal AI ?</h3>\n<!--kg-card-end: markdown--><p>Deep learning techniques do a good job at building models by correlating data points. But many AI researchers believe that more work needs to be done to understand causation and not just correlation. The field causal deep learning -- useful in determining <em>why something happened </em>-- is still in its infancy, and it is much more difficult to automate than neural networks.</p><!--kg-card-begin: markdown--><blockquote>\n<p>Even though we can observe correlation, it does not prove causation.</p>\n</blockquote>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><div class=\"kg-card kg-code-card gatsby-highlight\" data-language=\"text\"><pre class=\"language-text\"><code class=\"language-text\">Correlation: measures the relationship between two things.\nCausation: means that one thing will cause the other to happen.\n</code></pre></div>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><ul>\n<li>\n<div class=\"kg-card kg-code-card gatsby-highlight\" data-language=\"text\"><pre class=\"language-text\"><code class=\"language-text\">I know that when I see X, I will see Y (Association/Correlation)\n</code></pre></div>\n</li>\n<li>\n<div class=\"kg-card kg-code-card gatsby-highlight\" data-language=\"text\"><pre class=\"language-text\"><code class=\"language-text\">I know that X causes Y (Causality)\n</code></pre></div>\n</li>\n</ul>\n<!--kg-card-end: markdown--><p>The distinctions between the two can have important implications. In the website, <a href=\"http://www.tylervigen.com/spurious-correlations\">“Spurious Correlations”</a> by Tyler Vigen, we can explore a wide variety of correlations that are due to chance. One of the interesting spurious correlation is listed below. 
</p><!--kg-card-begin: markdown--><p><img src=\"https://user-images.githubusercontent.com/33357428/209831536-c820c96b-3347-4a30-b0f0-dca9e409e6f0.jpeg\" align=\"center\" alt=\"Spurious Correlation Example\" width=\"720\" height=\"283\"></p>\n<!--kg-card-end: markdown--><p>While these two variables have a correlation of 95.23%, <em>it is highly unrealistic to think that a degree in mathematics caused an increase in the amount of Uranium stored in the United States!</em></p><!--kg-card-begin: markdown--><p><img src=\"https://user-images.githubusercontent.com/33357428/208087796-ad8b5727-8db3-4d71-9881-8a4f56169964.png\" align=\"center\" alt=\"CorrelationVsCausation\"></p>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><blockquote>\n<p>“If Correlation Doesn’t Imply Causation, Then What Does?”</p>\n</blockquote>\n<!--kg-card-end: markdown--><p>We will uncover this in the upcoming blog posts.</p><p>The excitement in the field has been kindled by Judea Pearl, a professor at UCLA, who did some of the formative work on implementing <a href=\"https://www.techtarget.com/searchenterpriseai/feature/Bayesian-networks-applications-are-fueling-enterprise-support\">Bayesian networks for statistical analysis</a>. More recently he has been developing a framework for diagramming causation and teasing apart the factors that contribute to observed events in a computable framework. </p><!--kg-card-begin: markdown--><p><img src=\"https://user-images.githubusercontent.com/33357428/208084630-fd654f0e-0bc2-4576-844a-5d398c75d5a8.jpg\" align=\"center\" alt=\"The Book Of Why\" width=\"600\" height=\"400\"></p>\n<!--kg-card-end: markdown--><p>In his latest book, “<a href=\"https://www.basicbooks.com/titles/judea-pearl/the-book-of-why/9780465097609/\">The Book of Why: The New Science of Cause and Effect</a>,” he argues that artificial intelligence has been handicapped by an incomplete understanding of what intelligence really is. 
In his new book, Pearl, elaborates a vision for how truly intelligent machines would think. The key, he argues, is to replace reasoning by association with causal reasoning. <em>Instead of the mere ability to correlate fever and malaria, machines need the capacity to reason that malaria causes fever. </em>Pearl expects that causal reasoning could provide machines with human-level intelligence.</p><!--kg-card-begin: markdown--><p><img src=\"https://user-images.githubusercontent.com/33357428/208090292-28654bde-487b-4f4e-b1ea-dd957c8ae7f0.jpg\" align=\"center\" alt=\"The Why ?\" height=\"400\"></p>\n<!--kg-card-end: markdown--><p>Understanding <em>why</em> requires understanding of the <em>whats, the wheres, and the whens.</em> The <em>hows</em>, however, seem to be an implementation of the <em>whys !!</em></p><!--kg-card-begin: markdown--><h3 id=\"traditional-ml-vs-causal-ml\">Traditional ML Vs Causal ML</h3>\n<!--kg-card-end: markdown--><p>Machine learning is a type of artificial intelligence that involves training algorithms to recognise patterns in data and make predictions or decisions based on those patterns. The goal of machine learning is to build models that can generalise to new data and make accurate predictions or decisions.</p><p>The main difference between machine learning and causal machine learning is the focus of the analysis. While traditional machine learning aims to predict an outcome based on patterns in the data, causal machine learning aims to identify the specific variables that are causing an outcome and how they contribute to it. This involves identifying and modelling the causal relationships between variables, rather than just the statistical relationships.</p><p>Causal machine learning is a type of machine learning that focuses on identifying the cause-and-effect relationships between variables. 
In other words, it aims to identify the factors that influence a particular outcome and how they influence it.</p><p>Causal machine learning can be used to improve the accuracy of predictive models by taking into account the underlying causes of the outcomes being predicted. It is also useful in a variety of applications, such as evaluating the effectiveness of interventions, predicting the impact of policy changes, and identifying the factors that contribute to certain outcomes.</p><!--kg-card-begin: markdown--><h3 id=\"explainable-ai-vs-causal-ai\">Explainable AI vs Causal AI</h3>\n<!--kg-card-end: markdown--><p><em>Explainable AI (XAI)</em> is a type of artificial intelligence (AI) that is designed to be transparent and interpretable, meaning that it can provide explanations for its decisions and actions. The goal of XAI is to make AI systems more transparent and understandable to humans, so that they can be trusted and used more effectively.</p><p>Causal AI, on the other hand, is a type of AI that focuses on identifying the cause-and-effect relationships between variables. It is a sub field of machine learning that aims to identify the specific factors that influence a particular outcome and how they contribute to it. One key difference between explainable AI and causal AI is their focus. Explainable AI is concerned with making AI systems more transparent and interpretable, while causal AI is focused on identifying the causes and effects of various phenomena.</p><p>Another difference is the techniques used. Explainable AI often employs techniques such as feature importance, sensitivity analysis, and model interpretation methods to provide insights into the decision-making process of AI systems. 
Causal AI, on the other hand, typically uses techniques such as structural equation modelling, instrumental variables, and counterfactual analysis to identify the causal relationships between variables.</p><p>While XAI and causal AI are related in that they both aim to increase our understanding of how AI systems work, they are distinct concepts. XAI focuses on providing explanations for the decisions and predictions made by AI systems, while causal AI focuses on identifying the underlying causes of outcomes.</p><p>One way in which XAI and causal AI can be related is that XAI methods, such as counterfactual analysis, can be used to identify the causes of particular outcomes or decisions. However, it is important to note that not all XAI methods are causal in nature, and not all causal AI systems are necessarily explainable. Overall, both XAI and causal AI are important tools for improving our understanding of how AI systems work and for increasing the transparency and trustworthiness of AI systems.</p><!--kg-card-begin: markdown--><h3 id=\"applications\">Applications</h3>\n<!--kg-card-end: markdown--><p>Causal machine learning can be applied in many different fields and industries to identify and understand the causal relationships between variables. Here are a few examples of the application of causal machine learning:</p><ol><li>Healthcare: Causal machine learning can be used to identify the factors that contribute to the development of certain diseases and to predict the likelihood of future events, such as hospital re-admissions. It can also be used to understand the factors that influence patient outcomes and to develop personalised treatment plans.</li><li>Finance: Causal machine learning can be used to understand the factors that influence stock prices and to develop trading strategies. 
It can also be used to identify the causes of financial fraud and to develop strategies to prevent it.</li><li>Marketing: Causal machine learning can be used to understand the factors that influence customer behaviour and to optimize marketing campaigns. It can also be used to identify the factors that drive customer loyalty and to develop strategies to retain customers.</li><li>Education: Causal machine learning can be used to understand the factors that influence student learning and to develop personalized learning strategies. It can also be used to identify the causes of academic achievement gaps and to develop strategies to address them.</li><li>Social science: Causal machine learning can be used to understand the factors that influence social outcomes and to develop policies that address social issues. It can also be used to identify the causes of social inequality and to develop strategies to address it.</li></ol><p>These are just a few examples of the many ways in which causal machine learning can be applied. There are many more applications in various fields and industries.</p><!--kg-card-begin: markdown--><h3 id=\"conclusion\">Conclusion</h3>\n<!--kg-card-end: markdown--><p>In this post I have provided an introduction to Causal machine learning. I have tried to be as compact as possible. Below you can find some additional resources if you want to know more about Causal machine learning. Hope you enjoyed reading so far!. </p><!--kg-card-begin: markdown--><h3 id=\"references\">References</h3>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><ol>\n<li>\n<p><a href=\"https://www.manning.com/books/causal-machine-learning\">Textbook : Causal Machine Learning - Mannings</a></p>\n</li>\n<li>\n<p><a href=\"https://arxiv.org/abs/2206.15475\">Paper : Causal Machine Learning: A Survey and Open Problems, 2022. Jean Kaddour, Aengus Lynch, Qi Liu, Matt J. 
Kusner, Ricardo Silva.</a></p>\n</li>\n<li>\n<p><a href=\"https://why21.causalai.net\">Workshop : NeurIPS 2021 Workshop</a></p>\n</li>\n<li>\n<p><a href=\"https://www.youtube.com/playlist?list=PLxq_lXOUlvQAoWZEqhRqHNezS30lI49G-\">Course : Machine Learning &#x26; Causal Inference: A Short Course</a></p>\n</li>\n<li>\n<p><a href=\"https://ftp.cs.ucla.edu/pub/stat_ser/r350.pdf\">Texbook : Causal inference in statistics:An overview. Judea Pearl</a></p>\n</li>\n<li>\n<p><a href=\"https://www.microsoft.com/en-us/research/group/causal-inference/#!publications\">Industry : Causality and Machine Learning: Microsoft Research</a></p>\n</li>\n</ol>\n<!--kg-card-end: markdown-->","htmlAst":{"type":"root","children":[{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"blockquote","properties":{},"children":[{"type":"text","value":"\n"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"To Build Truly Intelligent Machines, Teach Them Cause and Effect."}]},{"type":"text","value":"\n"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"blockquote","properties":{},"children":[{"type":"text","value":"\n"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Correlation is not Causation!."}]},{"type":"text","value":"\n"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"This post is the first of the series on "},{"type":"element","tagName":"a","properties":{"href":"/tag/causal-machine-learning"},"children":[{"type":"text","value":"Causal Machine Learning"}]},{"type":"text","value":". 
I will start with the very basics of causal inference , Provide some basic background in Bayesian networks/graphical models , Show how graphical models can be used in causal inference and Describe application scenarios , case studies and the practical difficulties in Causal Inference. I will follow step by step approach and keep the things as simple as possible with lot of practical tutorials. In each of the session i will provide additional learning materials and references , which should help you to keep the flow. In this blog post , i will present "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"Why we need causality in Machine Learning! "}]},{"type":"text","value":". Enjoy the reading!"}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h3","properties":{"id":"what-is-causal-ai"},"children":[{"type":"text","value":"What is Causal AI ?"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Deep learning techniques do a good job at building models by correlating data points. But many AI researchers believe that more work needs to be done to understand causation and not just correlation. 
The field causal deep learning -- useful in determining "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"why something happened "}]},{"type":"text","value":"-- is still in its infancy, and it is much more difficult to automate than neural networks."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"blockquote","properties":{},"children":[{"type":"text","value":"\n"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Even though we can observe correlation, it does not prove causation."}]},{"type":"text","value":"\n"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"div","properties":{"className":["kg-card","kg-code-card","gatsby-highlight"],"dataLanguage":"text"},"children":[{"type":"element","tagName":"pre","properties":{"className":["language-text"]},"children":[{"type":"element","tagName":"code","properties":{"className":["language-text"]},"children":[{"type":"text","value":"Correlation: measures the relationship between two things.\nCausation: means that one thing will cause the other to happen.\n"}]}]}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"ul","properties":{},"children":[{"type":"text","value":"\n"},{"type":"element","tagName":"li","properties":{},"children":[{"type":"text","value":"\n"},{"type":"element","tagName":"div","properties":{"className":["kg-card","kg-code-card","gatsby-highlight"],"dataLanguage":"text"},"children":[{"type":"element","tagName":"pre","properties":{"className":["language-text"]},"children":[{"type":"element","tagName":"code","properties":{"className":["language-text"]},"children":[{"type":"text","value":"I know that when I see X, I will see Y 
(Association/Correlation)\n"}]}]}]},{"type":"text","value":"\n"}]},{"type":"text","value":"\n"},{"type":"element","tagName":"li","properties":{},"children":[{"type":"text","value":"\n"},{"type":"element","tagName":"div","properties":{"className":["kg-card","kg-code-card","gatsby-highlight"],"dataLanguage":"text"},"children":[{"type":"element","tagName":"pre","properties":{"className":["language-text"]},"children":[{"type":"element","tagName":"code","properties":{"className":["language-text"]},"children":[{"type":"text","value":"I know that X causes Y (Causality)\n"}]}]}]},{"type":"text","value":"\n"}]},{"type":"text","value":"\n"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"The distinctions between the two can have important implications. In the website, "},{"type":"element","tagName":"a","properties":{"href":"http://www.tylervigen.com/spurious-correlations"},"children":[{"type":"text","value":"“Spurious Correlations”"}]},{"type":"text","value":" by Tyler Vigen, we can explore a wide variety of correlations that are due to chance. One of the interesting spurious correlation is listed below. 
"}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/209831536-c820c96b-3347-4a30-b0f0-dca9e409e6f0.jpeg","align":"center","alt":"Spurious Correlation Example","width":720,"height":283},"children":[]}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"While these two variables have a correlation of 95.23%, "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"it is highly unrealistic to think that a degree in mathematics caused an increase in the amount of Uranium stored in the United States!"}]}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/208087796-ad8b5727-8db3-4d71-9881-8a4f56169964.png","align":"center","alt":"CorrelationVsCausation"},"children":[]}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"blockquote","properties":{},"children":[{"type":"text","value":"\n"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"“If Correlation Doesn’t Imply Causation, Then What Does?”"}]},{"type":"text","value":"\n"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"We will uncover this in the upcoming blog posts."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"The excitement in the field has been kindled by Judea Pearl, a professor at UCLA, who did some of the formative work on 
implementing "},{"type":"element","tagName":"a","properties":{"href":"https://www.techtarget.com/searchenterpriseai/feature/Bayesian-networks-applications-are-fueling-enterprise-support"},"children":[{"type":"text","value":"Bayesian networks for statistical analysis"}]},{"type":"text","value":". More recently he has been developing a framework for diagramming causation and teasing apart the factors that contribute to observed events in a computable framework. "}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/208084630-fd654f0e-0bc2-4576-844a-5d398c75d5a8.jpg","align":"center","alt":"The Book Of Why","width":600,"height":400},"children":[]}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"In his latest book, “"},{"type":"element","tagName":"a","properties":{"href":"https://www.basicbooks.com/titles/judea-pearl/the-book-of-why/9780465097609/"},"children":[{"type":"text","value":"The Book of Why: The New Science of Cause and Effect"}]},{"type":"text","value":",” he argues that artificial intelligence has been handicapped by an incomplete understanding of what intelligence really is. In his new book, Pearl, elaborates a vision for how truly intelligent machines would think. The key, he argues, is to replace reasoning by association with causal reasoning. "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"Instead of the mere ability to correlate fever and malaria, machines need the capacity to reason that malaria causes fever. 
"}]},{"type":"text","value":"Pearl expects that causal reasoning could provide machines with human-level intelligence."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/208090292-28654bde-487b-4f4e-b1ea-dd957c8ae7f0.jpg","align":"center","alt":"The Why ?","height":400},"children":[]}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Understanding "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"why"}]},{"type":"text","value":" requires understanding of the "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"whats, the wheres, and the whens."}]},{"type":"text","value":" The "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"hows"}]},{"type":"text","value":", however, seem to be an implementation of the "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"whys !!"}]}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h3","properties":{"id":"traditional-ml-vs-causal-ml"},"children":[{"type":"text","value":"Traditional ML Vs Causal ML"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Machine learning is a type of artificial intelligence that involves training algorithms to recognise patterns in data and make predictions or decisions based on those patterns. 
The goal of machine learning is to build models that can generalise to new data and make accurate predictions or decisions."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"The main difference between machine learning and causal machine learning is the focus of the analysis. While traditional machine learning aims to predict an outcome based on patterns in the data, causal machine learning aims to identify the specific variables that are causing an outcome and how they contribute to it. This involves identifying and modelling the causal relationships between variables, rather than just the statistical relationships."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Causal machine learning is a type of machine learning that focuses on identifying the cause-and-effect relationships between variables. In other words, it aims to identify the factors that influence a particular outcome and how they influence it."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Causal machine learning can be used to improve the accuracy of predictive models by taking into account the underlying causes of the outcomes being predicted. 
It is also useful in a variety of applications, such as evaluating the effectiveness of interventions, predicting the impact of policy changes, and identifying the factors that contribute to certain outcomes."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h3","properties":{"id":"explainable-ai-vs-causal-ai"},"children":[{"type":"text","value":"Explainable AI vs Causal AI"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"Explainable AI (XAI)"}]},{"type":"text","value":" is a type of artificial intelligence (AI) that is designed to be transparent and interpretable, meaning that it can provide explanations for its decisions and actions. The goal of XAI is to make AI systems more transparent and understandable to humans, so that they can be trusted and used more effectively."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Causal AI, on the other hand, is a type of AI that focuses on identifying the cause-and-effect relationships between variables. It is a sub field of machine learning that aims to identify the specific factors that influence a particular outcome and how they contribute to it. One key difference between explainable AI and causal AI is their focus. Explainable AI is concerned with making AI systems more transparent and interpretable, while causal AI is focused on identifying the causes and effects of various phenomena."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Another difference is the techniques used. Explainable AI often employs techniques such as feature importance, sensitivity analysis, and model interpretation methods to provide insights into the decision-making process of AI systems. 
Causal AI, on the other hand, typically uses techniques such as structural equation modelling, instrumental variables, and counterfactual analysis to identify the causal relationships between variables."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"While XAI and causal AI are related in that they both aim to increase our understanding of how AI systems work, they are distinct concepts. XAI focuses on providing explanations for the decisions and predictions made by AI systems, while causal AI focuses on identifying the underlying causes of outcomes."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"One way in which XAI and causal AI can be related is that XAI methods, such as counterfactual analysis, can be used to identify the causes of particular outcomes or decisions. However, it is important to note that not all XAI methods are causal in nature, and not all causal AI systems are necessarily explainable. Overall, both XAI and causal AI are important tools for improving our understanding of how AI systems work and for increasing the transparency and trustworthiness of AI systems."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h3","properties":{"id":"applications"},"children":[{"type":"text","value":"Applications"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Causal machine learning can be applied in many different fields and industries to identify and understand the causal relationships between variables. 
Here are a few examples of the application of causal machine learning:"}]},{"type":"element","tagName":"ol","properties":{},"children":[{"type":"element","tagName":"li","properties":{},"children":[{"type":"text","value":"Healthcare: Causal machine learning can be used to identify the factors that contribute to the development of certain diseases and to predict the likelihood of future events, such as hospital re-admissions. It can also be used to understand the factors that influence patient outcomes and to develop personalised treatment plans."}]},{"type":"element","tagName":"li","properties":{},"children":[{"type":"text","value":"Finance: Causal machine learning can be used to understand the factors that influence stock prices and to develop trading strategies. It can also be used to identify the causes of financial fraud and to develop strategies to prevent it."}]},{"type":"element","tagName":"li","properties":{},"children":[{"type":"text","value":"Marketing: Causal machine learning can be used to understand the factors that influence customer behaviour and to optimize marketing campaigns. It can also be used to identify the factors that drive customer loyalty and to develop strategies to retain customers."}]},{"type":"element","tagName":"li","properties":{},"children":[{"type":"text","value":"Education: Causal machine learning can be used to understand the factors that influence student learning and to develop personalized learning strategies. It can also be used to identify the causes of academic achievement gaps and to develop strategies to address them."}]},{"type":"element","tagName":"li","properties":{},"children":[{"type":"text","value":"Social science: Causal machine learning can be used to understand the factors that influence social outcomes and to develop policies that address social issues. 
It can also be used to identify the causes of social inequality and to develop strategies to address it."}]}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"These are just a few examples of the many ways in which causal machine learning can be applied. There are many more applications in various fields and industries."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h3","properties":{"id":"conclusion"},"children":[{"type":"text","value":"Conclusion"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"In this post I have provided an introduction to Causal machine learning. I have tried to be as compact as possible. Below you can find some additional resources if you want to know more about Causal machine learning. Hope you enjoyed reading so far!. "}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h3","properties":{"id":"references"},"children":[{"type":"text","value":"References"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"ol","properties":{},"children":[{"type":"text","value":"\n"},{"type":"element","tagName":"li","properties":{},"children":[{"type":"text","value":"\n"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"element","tagName":"a","properties":{"href":"https://www.manning.com/books/causal-machine-learning"},"children":[{"type":"text","value":"Textbook : Causal Machine Learning - 
Mannings"}]}]},{"type":"text","value":"\n"}]},{"type":"text","value":"\n"},{"type":"element","tagName":"li","properties":{},"children":[{"type":"text","value":"\n"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"element","tagName":"a","properties":{"href":"https://arxiv.org/abs/2206.15475"},"children":[{"type":"text","value":"Paper : Causal Machine Learning: A Survey and Open Problems, 2022. Jean Kaddour, Aengus Lynch, Qi Liu, Matt J. Kusner, Ricardo Silva."}]}]},{"type":"text","value":"\n"}]},{"type":"text","value":"\n"},{"type":"element","tagName":"li","properties":{},"children":[{"type":"text","value":"\n"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"element","tagName":"a","properties":{"href":"https://why21.causalai.net"},"children":[{"type":"text","value":"Workshop : NeurIPS 2021 Workshop"}]}]},{"type":"text","value":"\n"}]},{"type":"text","value":"\n"},{"type":"element","tagName":"li","properties":{},"children":[{"type":"text","value":"\n"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"element","tagName":"a","properties":{"href":"https://www.youtube.com/playlist?list=PLxq_lXOUlvQAoWZEqhRqHNezS30lI49G-"},"children":[{"type":"text","value":"Course : Machine Learning & Causal Inference: A Short Course"}]}]},{"type":"text","value":"\n"}]},{"type":"text","value":"\n"},{"type":"element","tagName":"li","properties":{},"children":[{"type":"text","value":"\n"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"element","tagName":"a","properties":{"href":"https://ftp.cs.ucla.edu/pub/stat_ser/r350.pdf"},"children":[{"type":"text","value":"Texbook : Causal inference in statistics:An overview. 
Judea Pearl"}]}]},{"type":"text","value":"\n"}]},{"type":"text","value":"\n"},{"type":"element","tagName":"li","properties":{},"children":[{"type":"text","value":"\n"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"element","tagName":"a","properties":{"href":"https://www.microsoft.com/en-us/research/group/causal-inference/#!publications"},"children":[{"type":"text","value":"Industry : Causality and Machine Learning: Microsoft Research"}]}]},{"type":"text","value":"\n"}]},{"type":"text","value":"\n"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"}],"data":{"quirksMode":false}},"tableOfContents":[{"id":"what-is-causal-ai","heading":"What is Causal AI ?"},{"id":"traditional-ml-vs-causal-ml","heading":"Traditional ML Vs Causal ML"},{"id":"explainable-ai-vs-causal-ai","heading":"Explainable AI vs Causal AI"},{"id":"applications","heading":"Applications"},{"id":"conclusion","heading":"Conclusion"},{"id":"references","heading":"References"}]},"featureImageSharp":{"base":"photo-1485827404703-89b55fcc595e.jpg","publicURL":"/static/c78765cdf66c1694378ee695d13229c5/photo-1485827404703-89b55fcc595e.jpg","imageMeta":{"width":2000,"height":1333},"childImageSharp":{"fluid":{"base64":"data:image/jpeg;base64,/9j/2wBDABALDA4MChAODQ4SERATGCgaGBYWGDEjJR0oOjM9PDkzODdASFxOQERXRTc4UG1RV19iZ2hnPk1xeXBkeFxlZ2P/2wBDARESEhgVGC8aGi9jQjhCY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2P/wgARCAANABQDASIAAhEBAxEB/8QAFQABAQAAAAAAAAAAAAAAAAAABQD/xAAVAQEBAAAAAAAAAAAAAAAAAAACAf/aAAwDAQACEAMQAAABMTNSNNnIv//EABkQAAIDAQAAAAAAAAAAAAAAAAEDAAIRE//aAAgBAQABBQIV2FdLoyKMZbilOFX/xAAVEQEBAAAAAAAAAAAAAAAAAAAAEf/aAAgBAwEBPwFX/8QAFBEBAAAAAAAAAAAAAAAAAAAAEP/aAAgBAgEBPwE//8QAHBAAAQQDAQAAAAAAAAAAAAAAAQACITEDEBES/9oACAEBAAY/AlUt2PAnImngpf/EABwQAAMAAQUAAAAAAAAAAAAAAAABESExUWGB0f/aAAgBAQABPyFSSclqoRceDzGPrBdOWo27Bm6z2H//2gAMAwEAAgADAAAAEPTP/8QAFxEAAwEAAAAAAAAAAAAAAAAAAAERUf/aAAgBAwEBPxBuQnD/xAAWEQEBAQAAAAAAAAAAAAAAAAABABH/2gAIAQIBAT8QDbG//8QAHRABA
AMAAgMBAAAAAAAAAAAAAQARITFhQXGhwf/aAAgBAQABPxBE7CgOuJYFeFaVwPkKvqbtjgZaove/kG2kH+DauVJYvOpl/J//2Q==","aspectRatio":1.4957264957264957,"src":"/static/c78765cdf66c1694378ee695d13229c5/ea4ab/photo-1485827404703-89b55fcc595e.jpg","srcSet":"/static/c78765cdf66c1694378ee695d13229c5/477ba/photo-1485827404703-89b55fcc595e.jpg 175w,\n/static/c78765cdf66c1694378ee695d13229c5/06776/photo-1485827404703-89b55fcc595e.jpg 350w,\n/static/c78765cdf66c1694378ee695d13229c5/ea4ab/photo-1485827404703-89b55fcc595e.jpg 700w,\n/static/c78765cdf66c1694378ee695d13229c5/3055e/photo-1485827404703-89b55fcc595e.jpg 1050w,\n/static/c78765cdf66c1694378ee695d13229c5/eff08/photo-1485827404703-89b55fcc595e.jpg 1400w,\n/static/c78765cdf66c1694378ee695d13229c5/4e5f3/photo-1485827404703-89b55fcc595e.jpg 2000w","srcWebp":"/static/c78765cdf66c1694378ee695d13229c5/89afa/photo-1485827404703-89b55fcc595e.webp","srcSetWebp":"/static/c78765cdf66c1694378ee695d13229c5/9fca7/photo-1485827404703-89b55fcc595e.webp 175w,\n/static/c78765cdf66c1694378ee695d13229c5/37a4e/photo-1485827404703-89b55fcc595e.webp 350w,\n/static/c78765cdf66c1694378ee695d13229c5/89afa/photo-1485827404703-89b55fcc595e.webp 700w,\n/static/c78765cdf66c1694378ee695d13229c5/78e7a/photo-1485827404703-89b55fcc595e.webp 1050w,\n/static/c78765cdf66c1694378ee695d13229c5/03d34/photo-1485827404703-89b55fcc595e.webp 1400w,\n/static/c78765cdf66c1694378ee695d13229c5/49d6b/photo-1485827404703-89b55fcc595e.webp 2000w","sizes":"(max-width: 700px) 100vw, 700px"}}}}},{"node":{"id":"Ghost__Post__63ff26ec72a3c427182edd36","title":"Shapley Additive Explanations (SHAP)","slug":"shapley-additive-explanations-shap","featured":false,"feature_image":"https://images.unsplash.com/photo-1522069213448-443a614da9b6?crop=entropy&cs=tinysrgb&fit=max&fm=jpg&ixid=MnwxMTc3M3wwfDF8c2VhcmNofDR8fGdhbWUlMjB0aGVvcnl8ZW58MHx8fHwxNjc3NjY2MTQ5&ixlib=rb-4.0.3&q=80&w=2000","excerpt":"Shapley Additive Explanations (SHAP) is a game-theoretic approach to interpret the output of any machine 
learning model. It aims to explain the contribution of each feature to the final prediction of the model, thereby providing a clear understanding of how the model makes its decisions. ","custom_excerpt":"Shapley Additive Explanations (SHAP) is a game-theoretic approach to interpret the output of any machine learning model. It aims to explain the contribution of each feature to the final prediction of the model, thereby providing a clear understanding of how the model makes its decisions. ","visibility":"public","created_at_pretty":"1 Mar 2023","published_at_pretty":"1 Mar 2023","updated_at_pretty":"14 Oct 2024","created_at":"2023-03-01T15:50:28.000+05:30","published_at":"2023-03-01T16:06:46.000+05:30","updated_at":"2024-10-14T22:57:53.000+05:30","meta_title":null,"meta_description":null,"og_description":null,"og_image":null,"og_title":null,"twitter_description":null,"twitter_image":null,"twitter_title":null,"authors":[{"slug":"amaljith","url":"http://localhost:2368/author/amaljith/","name":"Amaljith","bio":"Research Scholar @ IIT Kharagpur","cover_image":null,"profile_image":"http://localhost:2368/content/images/2022/09/Screenshot-from-2022-09-07-18-00-00.png","location":null,"website":null,"twitter":null,"facebook":null,"meta_title":null,"meta_description":null,"coverImageSharp":null,"profileImageSharp":null}],"primary_author":{"slug":"amaljith","url":"http://localhost:2368/author/amaljith/","name":"Amaljith","bio":"Research Scholar @ IIT 
Kharagpur","cover_image":null,"profile_image":"http://localhost:2368/content/images/2022/09/Screenshot-from-2022-09-07-18-00-00.png","location":null,"website":null,"twitter":null,"facebook":null,"meta_title":null,"meta_description":null,"coverImageSharp":null,"profileImageSharp":{"base":"Screenshot-from-2022-09-07-18-00-00.png","publicURL":"/static/28e31bfedd96b4afe90237d2c1f700c3/Screenshot-from-2022-09-07-18-00-00.png","imageMeta":{"width":316,"height":237},"childImageSharp":{"fluid":{"base64":"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAPCAYAAADkmO9VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAEJElEQVQ4yyXO21fTBQDA8d9fUh5LQAW1eEImbIPJbSB3kZtijjSBxNvAGSAqlJZ64GAHIQ0VjkKICCiCQ8cYsYFjYwhsYxeGjJsopmEnH76dYw+f94/w2jmKfaSX9wtTrC1a+bBk5d83dlbnzFhHehnq62TBZeLjGyfvFqf4a9nGstuAx/oM96Qai76NieFu5u0jLLrNCCsOI47nat56J1hbtLH60kTfg5tUV6hQFeSSFLGTgtwcBtUteGc0zLoGmXNpmXU8Yd6txTOlxvq8hzmr/hNhyTaC06hmya3nn6VxHKMPKS08RFF+HgWK/aiO5JP3TQ4xMgn3mqt599bC8pyBRY+OxRkdXkc/rrE+POZ+nMY+hFXHKPPWP5m3a/i4Yqax9kfCREEkyKPJSk3mQFYmOelpRISGEBMuwahv4/WC4dNuwa3l1Us906O9jGs7sWg7EHQdv6NurWVmvJv33iHKTuSyft06tm3xJygwEIlIxDb/zUhFIvw2+HDrxkX+XjUx5x7A6+pn3qXBZnyIoacFTftNhKarlTxprsMz3sPbWR2jmiYiJSICt27lqwB/tmza+L8Af5KkYsYNHXhcvbidahy2x0ya2jFqm7lxqYxrF04jvLQMYB16jKazHrOulRWHjiJFOus/+5yAjX5s8vNhs58vfj5foooO43LJMaovncI1/ZhpWxcTlk6mRjrpa7uOrqsJYcaiY25imKftv6F52IBx4D61Z46y6Yv1+PpsYLOvD36+G0gJDuRcrIS0nTvY8XUAl66osDq60A3e5sHtXzCoW3CY+hHeeKdZdL5gytDDgsOAe1zL9KiawowEsoK2URi+HWWMmKvZuzibKuNcqoTiWBEXShToLHfo7K6h+9413DYzS/MehJnJYZxjOowDHdjHnjE53M2kUU1/zVmu742lOlNOkyKRmtzd1KkUtObFcacgmUetVTT0/kTDvQpemPoxDWmoralG8NrNvJqZwmbS8Xp5FrfNhMtuxtNei75Kia3xAmtdVZgaf6axvprKvXH8kB5FXd1lFMcyuVp7Hnd9E0+bb5Cdvgfhw4oXPq5hGR5kzGzkUVcPVeWlPK8tx/BrCfcrjnI+bx+KPSmcPJxP2QklFUolRfmHyf/uEEXK49y9XMndaxdJjpQirCy4WLCPoe38A4Nez8XySjISU5CFhCIO3k5wUDASkZiUuHj2Z2ahyN5HvuIgRUeOcUZZTOlJJeUlKgbU7Txta0AYbLnFqewMrpSepvhIITm700nblYg8IpJIWRjyiAjiI6JJjJKTGBtHcnwCqYlJpCelkJOWwfe533Jw337Kik9g1N5HqC9RcfyAgvTUPcRHyYmWypCFiJGKQggNDkYsEiEJEhEmCiE8REyYWIJMKiVCEoY8fCcJUTHE
x8QgCw0l/8Be/gOTiTiUD46AHAAAAABJRU5ErkJggg==","aspectRatio":1.3333333333333333,"src":"/static/28e31bfedd96b4afe90237d2c1f700c3/6ccb0/Screenshot-from-2022-09-07-18-00-00.png","srcSet":"/static/28e31bfedd96b4afe90237d2c1f700c3/7d89d/Screenshot-from-2022-09-07-18-00-00.png 28w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/f4091/Screenshot-from-2022-09-07-18-00-00.png 55w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/6ccb0/Screenshot-from-2022-09-07-18-00-00.png 110w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/30481/Screenshot-from-2022-09-07-18-00-00.png 165w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/33bd6/Screenshot-from-2022-09-07-18-00-00.png 220w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/d398b/Screenshot-from-2022-09-07-18-00-00.png 316w","srcWebp":"/static/28e31bfedd96b4afe90237d2c1f700c3/8678c/Screenshot-from-2022-09-07-18-00-00.webp","srcSetWebp":"/static/28e31bfedd96b4afe90237d2c1f700c3/59cda/Screenshot-from-2022-09-07-18-00-00.webp 28w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/7da75/Screenshot-from-2022-09-07-18-00-00.webp 55w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/8678c/Screenshot-from-2022-09-07-18-00-00.webp 110w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/f282e/Screenshot-from-2022-09-07-18-00-00.webp 165w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/a7b21/Screenshot-from-2022-09-07-18-00-00.webp 220w,\n/static/28e31bfedd96b4afe90237d2c1f700c3/fb2b8/Screenshot-from-2022-09-07-18-00-00.webp 316w","sizes":"(max-width: 110px) 100vw, 110px"}}}},"primary_tag":{"slug":"explainable-ai","url":"http://localhost:2368/tag/explainable-ai/","name":"Explainable AI","visibility":"public","feature_image":null,"description":null,"meta_title":null,"meta_description":null,"featureImageSharp":null},"tags":[{"slug":"explainable-ai","url":"http://localhost:2368/tag/explainable-ai/","name":"Explainable 
AI","visibility":"public","feature_image":null,"description":null,"meta_title":null,"meta_description":null,"featureImageSharp":null},{"slug":"artificial-intelligence","url":"http://localhost:2368/tag/artificial-intelligence/","name":"Artificial Intelligence","visibility":"public","feature_image":null,"description":null,"meta_title":null,"meta_description":null,"featureImageSharp":null},{"slug":"machine-learning","url":"http://localhost:2368/tag/machine-learning/","name":"Machine Learning","visibility":"public","feature_image":null,"description":null,"meta_title":null,"meta_description":null,"featureImageSharp":null}],"plaintext":"In this article , we will talk about SHAP in detail. We will discuss What are Shapley Values ? , Mathematical foundation behind the Shapley Values and How does SHAP (Shapley Additive Explanations) reframes the Shapey Value problem in detail. Also we will discuss What is Local Accuracy, Missingness, and Consistency in the context of explainable models , What is the Shapley Kernel etc.\n\n\nWhat is Shapley Values ?\n\n\nSHAP values were introduced  by Lundberg and Lee (2016) as a way to provide local explanations for machine learning models. They built upon Shapley's work in cooperative game theory, using the Shapley value to explain how each feature in a model contributes to the prediction for a particular instance.\n\nShapley values were invented by Lloyd Shapley as a way of providing a fair solution to the following question: if we have a coalition C that collaborates to produce a value V, how much did each individual member contribute to that final value ?\n\nSo what does this mean ? We have a coalition C, a group of cooperating members that work together to produce some value V, called the coalition value. This could be something like, a corporation of employees that together generate a certain profit, or a dinner group running up a restaurant bill. 
We want to know exactly how much each member contributed to that final coalition value; what share of the profit does each employee deserve, how much each person in the dinner party owes to settle the bill.\n\n\nHowever, answering this gets tricky when there are interacting effects between members, when certain permutations cause members to contribute more than the sum of their parts. To find a fair answer to this question that takes into account these interaction effects, we can compute the Shapley value for each member of the coalition.\n\nSo let’s how we can compute the Shapley value for member 1 of our example coalition. The way this is done is by sampling a coalition that contains member 1, and then looking at the coalition formed by removing that member.\n\n\nWe then look at the respective values of these two coalitions, and compare the difference between the two.\n\n\nThis difference is the marginal contribution of member 1 to the coalition consisting of members 2, 3, and 4; how much member 1 contributed to that specific group.\n\n\nSo we then enumerate all such pairs of coalitions, that is, all pairs of coalitions that only differ based on whether or not member 1 is included, and then look at all the marginal contributions for each.\n\n\nThe mean marginal contribution is the Shapley value of that member.\n\n\nWe can do this same process for each member of the coalition, and we’ve found a fair solution  to our original question.!\n\nMathematically, the whole process looks like this, but all we need is to know that the Shapley value is the average amount of contribution that a particular member makes to the coalition value.\n\n\nShapley Values To SHAP\n\n\nNow, translating this concept to model explainability is relatively straightforward, and that’s exactly what Scott Lundberg and Su-In Lee did in 2017 with their paper “A Unified Approach to Interpreting Model Predictions,” where they introduced SHAP.\n\n\nSHAP reframes the Shapley value problem from one 
where we look at how members of a coalition contribute to a coalition value to one where we look at how individual features contribute to a model’s outputs.\n\nThey do this in a very specific way, one that we can get a clue to by looking at the name of their algorithm; Shapley Additive Explanations. We know what Shapley values are, we know what Explanations are, but what do they mean by Additive?\n\n\nLundberg and Lee define an additive feature attribution as follows: if we have a set a of inputs x, and a model f(x), we can define a set of simplified local inputs x’ (which usually means that we turn a feature vector into a discrete binary vector, where features are either included or excluded) and we can also define an explanatory model g.\n\n\nWhat we need to ensure is that One: if x’ is roughly equal to x then g(x’) should be roughly equal to f(x)\n\ntwo: g must take below form, where phi_0 is the null output of the model, that is, the average output of the model, and phi_i is the explained effect of feature_i; how much that feature changes the output of the model. This is called it’s attribution.\n\nIf we have these two, we have an explanatory model that has additive feature attribution. 
The advantage of this form of explanation is really easy to interpret; we can see the exact contribution and importance of each feature just by looking at the phi values.\n\n\nLocal Accuracy, Missingness and Consistency\n\n\nNow Lundberg and Lee go on to describe a set of three desirable properties of such an additive feature method; local accuracy, missingness, and consistency.\n\n\nLocal Accuracy\n\n\nLocal accuracy : it simply says if the input and the simplified input are roughly the same, then the actual model and the explanatory model should produce roughly the same output.\n\nMissingness\n\n\nMissingness states that if a feature is excluded from the model, it’s attribution must be zero; that is, the only thing that can affect the output of the explanation model is the inclusion of features, not the exclusion.\n\nConsistency\n\n\nFinally, we have consistency (and this one’s a little hard to represent mathematically), but it states that if the original model changes so that the a particular feature’s contribution changes, the attribution in the explanatory model cannot change in the opposite direction;\n\nSo for example, if we have a new model where a specific feature has a more positive contribution than in the original; the attribution in our new explanatory model cannot decrease.\n\nNow while a bunch of different explanation methods satisfy some of these properties, Lundberg and Lee argue that only SHAP satisfies all three; if the feature attributions in our additive explanatory model are specifically chosen to be the shapley values of those features, then all three properties are upheld.\n\n\nShapley Kernel\n\n\nThe problem with SHAP, however, is that computing Shapley values means you have to sample the coalition values for each possible feature permutation, which in a model explainability setting means we have to evaluate our model that number of times.\n\nFor a model that operates over 4 features, it’s easy enough, it’s just 16 coalitions to sample to 
get all the Shapley values. For 32 features, that’s over 17 billion samples, which is entirely untenable. To get around this, Lundberg and Lee devise the Shapley Kernel, a means of approximating shapley values through much fewer samples.\n\nSo what we do is we pass samples through the model, of various feature permutations of the particular datapoint that we’re trying to explain.\n\nOf course, most ML models won’t just let you omit a feature !\n\n\nSo what we do is define a background dataset B, one that contains a set of representative datapoints that the model was trained over.\n\n\nWe then fill in our omitted feature or features with values from the background dataset, while holding the features that are included in the permutation fixed to their original values.\n\n\nWe then take the average of the model output over all of these new synthetic datapoints as our model output for that feature permutation, which we’ll call that y bar.\n\n\n\nSo once we have a number of samples computed in this way,\n\n\nWe can formulate this as a weighted linear regression, with each feature assigned a coefficient.\n\nWith a very specific choice of weighting for each sample, based on a combination of the total number of features in the model, the number of coalitions with the same number of features as this particular sample, and the number of features included and excluded in this permutation, we ensure that the solution to this weighted linear regression is such that the returned coefficients are equivalent to the Shapley values.\n\nThis weighting scheme is the basis of the Shapley Kernel, and the weighted linear regression process as a whole is Kernel SHAP.\n\nNow, there are a lot of other forms of SHAP that are presented in the paper (Deep SHAP, Low Order SHAP , Max SHAP , Linear SHAP , Tree SHAP etc) , ones that make use of model-specific assumptions and optimizations to speed up the algorithm and the sampling process, but Kernel SHAP is the one among them that is universal 
and can be applied to any type of machine learning model. This general applicability is why we chose Kernel SHAP as the first form of SHAP to implement for TrustyAI.\n\n\nKernel SHAP Example\n\n\nI’ll run through an example of the Python SHAP implementation provided by Lundberg and Lee. So first I’ll grab a dataset to run our example over, and I’ve picked the Boston housing price dataset, which is a dataset consisting of various attributes about Boston neighborhoods and the corresponding house prices within that neighborhood.\n\nIncomplete. Coming soon","html":"<p>In this article , we will talk about SHAP in detail. We will discuss What are Shapley Values ? , Mathematical foundation behind the Shapley Values and How does SHAP (Shapley Additive Explanations) reframes the Shapey Value problem in detail. Also we will discuss What is Local Accuracy, Missingness, and Consistency in the context of explainable models , What is the Shapley Kernel etc. </p><!--kg-card-begin: markdown--><h3 id=\"what-is-shapley-values\">What is Shapley Values ?</h3>\n<!--kg-card-end: markdown--><p>SHAP values were introduced  by <a href=\"https://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf\" rel=\"noopener ugc nofollow\">Lundberg and Lee (2016)</a> as a way to provide local explanations for machine learning models. They built upon Shapley's work in cooperative game theory, using the Shapley value to explain how each feature in a model contributes to the prediction for a particular instance.</p><blockquote class=\"kg-blockquote-alt\">Shapley values were invented by Lloyd Shapley as a way of providing a fair solution to the following question: <em>if we have a coalition <strong>C</strong> that collaborates to produce a value <strong>V</strong>, how much did each individual member contribute to that final value ?</em> </blockquote><p>So what does this mean ? 
We have a coalition C, a group of cooperating members that work together to produce some value V, called the coalition value. This could be something like, a corporation of employees that together generate a certain profit, or a dinner group running up a restaurant bill. We want to know exactly how much each member contributed to that final coalition value; what share of the profit does each employee deserve, how much each person in the dinner party owes to settle the bill.</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222118086-efdfcda4-01e6-4d4d-bc6d-30ce019c464b.png\" align=\"center\" alt=\"Shapley Values\" width=\"720\" height=\"434\"/>\n<!--kg-card-end: markdown--><p>However, answering this gets tricky when there are interacting effects between members, when certain permutations cause members to contribute more than the sum of their parts. <strong><em>To find a fair answer to this question that takes into account these interaction effects, we can compute the Shapley value for each member of the coalition.</em></strong></p><p>So let’s see how we can compute the Shapley value for member 1 of our example coalition. 
The way this is done is by <em>sampling a coalition that contains member 1, and then looking at the coalition formed by removing that member.</em> </p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222119980-a7c2c357-eecf-4ea9-8de6-7eefacf4e362.png\" align=\"center\" alt=\"Shapley Values\" width=\"720\" height=\"404\"/>\n<!--kg-card-end: markdown--><p>We then look at the respective values of these two coalitions, and compare the difference between the two.</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222120650-7a9cd97e-3dfd-488c-9a24-45638a75847a.png\" align=\"center\" alt=\"Shapley Values\" width=\"720\" height=\"512\"/>\n<!--kg-card-end: markdown--><p><em>This difference is the marginal contribution of member 1 to the coalition consisting of members 2, 3, and 4; how much member 1 contributed to that specific group.</em></p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222120708-c27b48f7-7712-4e7d-ae44-c998a859215c.png\" align=\"center\" alt=\"Shapley Values\" width=\"720\" height=\"136\"/>\n<!--kg-card-end: markdown--><p>So we then enumerate all such pairs of coalitions, that is, all pairs of coalitions that only differ based on whether or not member 1 is included, and then look at all the marginal contributions for each. 
</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222121573-e1c9b15d-d883-44f0-b6e3-41fff235ff06.png\" align=\"center\" alt=\"Shapley Values\" width=\"720\" height=\"333\"/>\n<!--kg-card-end: markdown--><p><em>The mean marginal contribution is the Shapley value of that member.</em></p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222121676-1e0e6852-0b66-44b7-bbd4-11fd8e25b39f.png\" align=\"center\" alt=\"Shapley Values\" width=\"720\" height=\"143\"/>\n<!--kg-card-end: markdown--><p>We can do this same process for each member of the coalition, and we’ve found a fair solution to our original question!</p><p>Mathematically, the whole process looks like this, but all we need to know is that the <strong><em>Shapley value is the average amount of contribution that a particular member makes to the coalition value.</em></strong></p><!--kg-card-begin: markdown--><h3 id=\"shapley-values-to-shap\">Shapley Values To SHAP</h3>\n<!--kg-card-end: markdown--><p>Now, translating this concept to model explainability is relatively straightforward, and that’s exactly what Scott Lundberg and Su-In Lee did in 2017 with their paper <a href=\"https://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf\">“A Unified Approach to Interpreting Model Predictions,”</a> where they introduced SHAP. 
</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222123143-27853042-9e12-4526-91dd-466e04f8a68f.png\" align=\"center\" alt=\"Shapley Paper\" width=\"720\" height=\"666\"/>\n<!--kg-card-end: markdown--><p>SHAP reframes the Shapley value problem from one where we look at <em>how members of a coalition contribute to a coalition value</em> to one where we look at <em>how individual features contribute to a model’s outputs.</em></p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222123344-4acf9530-e696-4ee7-80f0-4835cd9cc775.png\" align=\"center\" alt=\"Shapley Paper\" width=\"580\" height=\"352\"/><!--kg-card-end: markdown--><!--kg-card-begin: markdown--><p>They do this in a very specific way, one that we can get a clue to by looking at the name of their algorithm; <span style=\"color:red\">Shapley</span> <span style=\"color:blue\">Additive</span> <span style=\"color:green\">Explanations</span>. We know what <span style=\"color:red\">Shapley</span> values are, we know what <span style=\"color:green\">Explanations</span> are, but what do they mean by <span style=\"color:blue\">Additive</span>?</p>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><p>Lundberg and Lee define an additive feature attribution as follows: if we have a set of <span style=\"color:blue\">inputs x</span>, and <span style=\"color:blue\">a model f(x)</span>, we can define a set of simplified <span style=\"color:green\">local inputs x’</span> (which usually means that we turn a feature vector into a discrete binary vector, where features are either included or excluded) and we can also define an <span style=\"color:green\">explanatory model g</span>.</p>\n<!--kg-card-end: markdown--><p>What we need to ensure is that One: if x’ is roughly equal to x then g(x’) should be roughly equal to f(x)</p><!--kg-card-begin: markdown--><img 
src=\"https://user-images.githubusercontent.com/33357428/222126320-96c62ae5-2dcd-46eb-8a64-38727338c7dd.png\" align=\"center\" alt=\"SHAP\" width=\"404\" height=\"222\"/><!--kg-card-end: markdown--><p>two: g must take the form below, where <em>phi_0 is the null output of the model</em>, that is, the average output of the model, and <em>phi_i is the explained effect of feature_i</em>; how much that feature changes the output of the model. <em>This is called its attribution.</em></p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222126643-5e4a7d98-53a6-4389-86cd-ac6d37041150.png\" align=\"center\" alt=\"SHAP\" width=\"625\" height=\"261\"/><!--kg-card-end: markdown--><p>If we have these two, we have an explanatory model that has additive feature attribution. The advantage of this form of explanation is that it is really easy to interpret; we can see the exact contribution and importance of each feature just by looking at the phi values.</p><!--kg-card-begin: markdown--><h3 id=\"local-accuracy-missingness-and-consistency\">Local Accuracy, Missingness and Consistency</h3>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><p>Now Lundberg and Lee go on to describe a set of three desirable properties of such an additive feature method; <span style=\"color:red\">local accuracy</span>, <span style=\"color:blue\">missingness</span>, and <span style=\"color:green\">consistency</span>.</p>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><h4 id=\"local-accuracy\">Local Accuracy</h4>\n<!--kg-card-end: markdown--><p>Local accuracy : it simply says if the input and the simplified input are roughly the same, then the actual model and the explanatory model should produce roughly the same output. 
</p><!--kg-card-begin: markdown--><h4 id=\"missingness\">Missingness</h4>\n<!--kg-card-end: markdown--><p>Missingness states that if a feature is excluded from the model, its attribution must be zero; that is, the only thing that can affect the output of the explanation model is the inclusion of features, not the exclusion. </p><!--kg-card-begin: markdown--><h4 id=\"consistency\">Consistency</h4>\n<!--kg-card-end: markdown--><p>Finally, we have consistency (and this one’s a little hard to represent mathematically), but it states that if the original model changes so that a particular feature’s contribution changes, the attribution in the explanatory model cannot change in the opposite direction; </p><p>So for example, if we have a new model where a specific feature has a more positive contribution than in the original; the attribution in our new explanatory model cannot decrease. </p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222395551-04403b51-1e8f-4bbe-84d7-1bb2f9d0b966.png\" align=\"center\" alt=\"SHAP\" width=\"720\" height=\"299\"/><!--kg-card-end: markdown--><p>Now while a bunch of different explanation methods satisfy some of these properties, <em>Lundberg and Lee argue that only SHAP satisfies all three</em>; <strong><em>if the feature attributions in our additive explanatory model are specifically chosen to be the shapley values of those features, then all three properties are upheld.</em></strong></p><!--kg-card-begin: markdown--><h3 id=\"shapley-kernel\">Shapley Kernel</h3>\n<!--kg-card-end: markdown--><p>The problem with SHAP, however, is that computing Shapley values means you have to sample the coalition values for each possible feature permutation, which in a model explainability setting means we have to evaluate our model that number of times. </p><p>For a model that operates over 4 features, it’s easy enough, it’s just 16 coalitions to sample to get all the Shapley values. 
For 32 features, that’s over 17 billion samples, which is entirely untenable. <em>To get around this, Lundberg and Lee devise the Shapley Kernel, a means of approximating shapley values through much fewer samples.</em></p><p>So what we do is we pass samples through the model, of various feature permutations of the particular datapoint that we’re trying to explain.</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222397887-4451072f-be0b-4e27-accc-e401b7ac35bf.png\" align=\"center\" alt=\"Shapley-Permu\" width=\"633\" height=\"407\"/><!--kg-card-end: markdown--><p>Of course, most ML models won’t just let you omit a feature !</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222398010-2e845aae-b21e-48ea-b16e-5da41af83b86.png\" align=\"center\" alt=\"Shapley-Omit-Feature\" width=\"588\" height=\"112\"/>\n<!--kg-card-end: markdown--><p>So what we do is define a background dataset B, one that contains a set of representative datapoints that the model was trained over. 
</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222398330-d64ddccf-b8f2-4f5e-880a-dfe05d99eb77.png\" align=\"center\" alt=\"BackgroundDataset\" width=\"636\" height=\"239\"/>\n<!--kg-card-end: markdown--><p>We then fill in our omitted feature or features with values from the background dataset, while holding the features that are included in the permutation fixed to their original values.</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222398831-31f4f53d-2610-47d6-9b45-89a7ff663fe4.png\" align=\"center\" alt=\"IncludedFeaturesFromDatasetB\" width=\"621\" height=\"331\"/>\n<!--kg-card-end: markdown--><p>We then take the average of the model output over all of these new synthetic datapoints as our model output for that feature permutation, which we’ll call that y bar.</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222399307-fc75df99-27a3-4d11-8fff-d785616fbee9.png\" align=\"center\" alt=\"Shapley-Avg\" width=\"720\" height=\"291\"/>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222399733-737c4ba1-23f2-4f97-996e-f3ace97fb3cf.png\" align=\"center\" alt=\"Shapley-ybar\" width=\"683\" height=\"340\"/>\n<!--kg-card-end: markdown--><p>So once we have a number of samples computed in this way, </p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222399945-9a2510d2-5595-42eb-887c-5b5762c9aa1e.png\" align=\"center\" alt=\"Shapley-NoOfYbar\" width=\"672\" height=\"400\"/>\n<!--kg-card-end: markdown--><p>We can formulate this as a weighted linear regression, with each feature assigned a coefficient.</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222400421-c3964125-f4d1-44e1-ba12-1231cb76671a.png\" align=\"center\" alt=\"SShapley-LinearReg\" width=\"672\" 
height=\"400\"/><!--kg-card-end: markdown--><p>With a very specific choice of weighting for each sample, based on a combination of the total number of features in the model, the number of coalitions with the same number of features as this particular sample, and the number of features included and excluded in this permutation, we ensure that the solution to this weighted linear regression is such that the returned coefficients are equivalent to the Shapley values. </p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222400526-11e50df4-880d-44e8-bd15-812560a8e005.png\" align=\"center\" alt=\"Shapley-Weighting\" width=\"720\" height=\"148\"/><!--kg-card-end: markdown--><p>This weighting scheme is the basis of the Shapley Kernel, and the weighted linear regression process as a whole is Kernel SHAP.</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222400590-c55c9b16-5ce4-4022-a6aa-73c294f2d1a8.png\" align=\"center\" alt=\"Shapley-Kernel-Weights\" width=\"720\" height=\"359\"/><!--kg-card-end: markdown--><p>Now, there are a lot of other forms of SHAP that are presented in the paper (Deep SHAP, Low Order SHAP , Max SHAP , Linear SHAP , Tree SHAP etc) , ones that make use of model-specific assumptions and optimizations to speed up the algorithm and the sampling process, but Kernel SHAP is the one among them that is universal and can be applied to any type of machine learning model. This general applicability is why we chose Kernel SHAP as the first form of SHAP to implement for TrustyAI.</p><!--kg-card-begin: markdown--><h3 id=\"kernel-shap-example\">Kernel SHAP Example</h3>\n<!--kg-card-end: markdown--><p>I’ll run through an example of the Python SHAP implementation provided by Lundberg and Lee. 
So first I’ll grab a dataset to run our example over, and I’ve picked the Boston housing price dataset, which is a dataset consisting of various attributes about Boston neighborhoods and the corresponding house prices within that neighborhood.</p><p>Incomplete. Coming soon</p>","url":"http://localhost:2368/shapley-additive-explanations-shap/","canonical_url":null,"uuid":"2239e072-7466-4d35-ba08-3fd48b607f1a","codeinjection_foot":null,"codeinjection_head":null,"codeinjection_styles":null,"comment_id":"63ff26ec72a3c427182edd36","reading_time":7,"send_email_when_published":null,"email_subject":null,"childHtmlRehype":{"html":"<p>In this article , we will talk about SHAP in detail. We will discuss What are Shapley Values ? , Mathematical foundation behind the Shapley Values and How SHAP (Shapley Additive Explanations) reframes the Shapley Value problem in detail. Also we will discuss What is Local Accuracy, Missingness, and Consistency in the context of explainable models , What is the Shapley Kernel etc. </p><!--kg-card-begin: markdown--><h3 id=\"what-is-shapley-values\">What is Shapley Values ?</h3>\n<!--kg-card-end: markdown--><p>SHAP values were introduced by <a href=\"https://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf\" rel=\"noopener ugc nofollow\">Lundberg and Lee (2017)</a> as a way to provide local explanations for machine learning models. They built upon Shapley's work in cooperative game theory, using the Shapley value to explain how each feature in a model contributes to the prediction for a particular instance.</p><blockquote class=\"kg-blockquote-alt\">Shapley values were invented by Lloyd Shapley as a way of providing a fair solution to the following question: <em>if we have a coalition <strong>C</strong> that collaborates to produce a value <strong>V</strong>, how much did each individual member contribute to that final value ?</em> </blockquote><p>So what does this mean ? 
We have a coalition C, a group of cooperating members that work together to produce some value V, called the coalition value. This could be something like, a corporation of employees that together generate a certain profit, or a dinner group running up a restaurant bill. We want to know exactly how much each member contributed to that final coalition value; what share of the profit does each employee deserve, how much each person in the dinner party owes to settle the bill.</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222118086-efdfcda4-01e6-4d4d-bc6d-30ce019c464b.png\" align=\"center\" alt=\"Shapley Values\" width=\"720\" height=\"434\">\n<!--kg-card-end: markdown--><p>However, answering this gets tricky when there are interacting effects between members, when certain permutations cause members to contribute more than the sum of their parts. <strong><em>To find a fair answer to this question that takes into account these interaction effects, we can compute the Shapley value for each member of the coalition.</em></strong></p><p>So let’s see how we can compute the Shapley value for member 1 of our example coalition. 
The way this is done is by <em>sampling a coalition that contains member 1, and then looking at the coalition formed by removing that member.</em> </p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222119980-a7c2c357-eecf-4ea9-8de6-7eefacf4e362.png\" align=\"center\" alt=\"Shapley Values\" width=\"720\" height=\"404\">\n<!--kg-card-end: markdown--><p>We then look at the respective values of these two coalitions, and compare the difference between the two.</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222120650-7a9cd97e-3dfd-488c-9a24-45638a75847a.png\" align=\"center\" alt=\"Shapley Values\" width=\"720\" height=\"512\">\n<!--kg-card-end: markdown--><p><em>This difference is the marginal contribution of member 1 to the coalition consisting of members 2, 3, and 4; how much member 1 contributed to that specific group.</em></p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222120708-c27b48f7-7712-4e7d-ae44-c998a859215c.png\" align=\"center\" alt=\"Shapley Values\" width=\"720\" height=\"136\">\n<!--kg-card-end: markdown--><p>So we then enumerate all such pairs of coalitions, that is, all pairs of coalitions that only differ based on whether or not member 1 is included, and then look at all the marginal contributions for each. 
</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222121573-e1c9b15d-d883-44f0-b6e3-41fff235ff06.png\" align=\"center\" alt=\"Shapley Values\" width=\"720\" height=\"333\">\n<!--kg-card-end: markdown--><p><em>The mean marginal contribution is the Shapley value of that member.</em></p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222121676-1e0e6852-0b66-44b7-bbd4-11fd8e25b39f.png\" align=\"center\" alt=\"Shapley Values\" width=\"720\" height=\"143\">\n<!--kg-card-end: markdown--><p>We can do this same process for each member of the coalition, and we’ve found a fair solution to our original question!</p><p>Mathematically, the whole process looks like this, but all we need to know is that the <strong><em>Shapley value is the average amount of contribution that a particular member makes to the coalition value.</em></strong></p><!--kg-card-begin: markdown--><h3 id=\"shapley-values-to-shap\">Shapley Values To SHAP</h3>\n<!--kg-card-end: markdown--><p>Now, translating this concept to model explainability is relatively straightforward, and that’s exactly what Scott Lundberg and Su-In Lee did in 2017 with their paper <a href=\"https://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf\">“A Unified Approach to Interpreting Model Predictions,”</a> where they introduced SHAP. 
</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222123143-27853042-9e12-4526-91dd-466e04f8a68f.png\" align=\"center\" alt=\"Shapley Paper\" width=\"720\" height=\"666\">\n<!--kg-card-end: markdown--><p>SHAP reframes the Shapley value problem from one where we look at <em>how members of a coalition contribute to a coalition value</em> to one where we look at <em>how individual features contribute to a model’s outputs.</em></p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222123344-4acf9530-e696-4ee7-80f0-4835cd9cc775.png\" align=\"center\" alt=\"Shapley Paper\" width=\"580\" height=\"352\"><!--kg-card-end: markdown--><!--kg-card-begin: markdown--><p>They do this in a very specific way, one that we can get a clue to by looking at the name of their algorithm; <span style=\"color:red\">Shapley</span> <span style=\"color:blue\">Additive</span> <span style=\"color:green\">Explanations</span>. We know what <span style=\"color:red\">Shapley</span> values are, we know what <span style=\"color:green\">Explanations</span> are, but what do they mean by <span style=\"color:blue\">Additive</span>?</p>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><p>Lundberg and Lee define an additive feature attribution as follows: if we have a set of <span style=\"color:blue\">inputs x</span>, and <span style=\"color:blue\">a model f(x)</span>, we can define a set of simplified <span style=\"color:green\">local inputs x’</span> (which usually means that we turn a feature vector into a discrete binary vector, where features are either included or excluded) and we can also define an <span style=\"color:green\">explanatory model g</span>.</p>\n<!--kg-card-end: markdown--><p>What we need to ensure is that One: if x’ is roughly equal to x then g(x’) should be roughly equal to f(x)</p><!--kg-card-begin: markdown--><img 
src=\"https://user-images.githubusercontent.com/33357428/222126320-96c62ae5-2dcd-46eb-8a64-38727338c7dd.png\" align=\"center\" alt=\"SHAP\" width=\"404\" height=\"222\"><!--kg-card-end: markdown--><p>two: g must take the form below, where <em>phi_0 is the null output of the model</em>, that is, the average output of the model, and <em>phi_i is the explained effect of feature_i</em>; how much that feature changes the output of the model. <em>This is called its attribution.</em></p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222126643-5e4a7d98-53a6-4389-86cd-ac6d37041150.png\" align=\"center\" alt=\"SHAP\" width=\"625\" height=\"261\"><!--kg-card-end: markdown--><p>If we have these two, we have an explanatory model that has additive feature attribution. The advantage of this form of explanation is that it is really easy to interpret; we can see the exact contribution and importance of each feature just by looking at the phi values.</p><!--kg-card-begin: markdown--><h3 id=\"local-accuracy-missingness-and-consistency\">Local Accuracy, Missingness and Consistency</h3>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><p>Now Lundberg and Lee go on to describe a set of three desirable properties of such an additive feature method; <span style=\"color:red\">local accuracy</span>, <span style=\"color:blue\">missingness</span>, and <span style=\"color:green\">consistency</span>.</p>\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><h4 id=\"local-accuracy\">Local Accuracy</h4>\n<!--kg-card-end: markdown--><p>Local accuracy : it simply says if the input and the simplified input are roughly the same, then the actual model and the explanatory model should produce roughly the same output. 
</p><!--kg-card-begin: markdown--><h4 id=\"missingness\">Missingness</h4>\n<!--kg-card-end: markdown--><p>Missingness states that if a feature is excluded from the model, its attribution must be zero; that is, the only thing that can affect the output of the explanation model is the inclusion of features, not the exclusion. </p><!--kg-card-begin: markdown--><h4 id=\"consistency\">Consistency</h4>\n<!--kg-card-end: markdown--><p>Finally, we have consistency (and this one’s a little hard to represent mathematically), but it states that if the original model changes so that a particular feature’s contribution changes, the attribution in the explanatory model cannot change in the opposite direction; </p><p>So for example, if we have a new model where a specific feature has a more positive contribution than in the original; the attribution in our new explanatory model cannot decrease. </p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222395551-04403b51-1e8f-4bbe-84d7-1bb2f9d0b966.png\" align=\"center\" alt=\"SHAP\" width=\"720\" height=\"299\"><!--kg-card-end: markdown--><p>Now while a bunch of different explanation methods satisfy some of these properties, <em>Lundberg and Lee argue that only SHAP satisfies all three</em>; <strong><em>if the feature attributions in our additive explanatory model are specifically chosen to be the shapley values of those features, then all three properties are upheld.</em></strong></p><!--kg-card-begin: markdown--><h3 id=\"shapley-kernel\">Shapley Kernel</h3>\n<!--kg-card-end: markdown--><p>The problem with SHAP, however, is that computing Shapley values means you have to sample the coalition values for each possible feature permutation, which in a model explainability setting means we have to evaluate our model that number of times. </p><p>For a model that operates over 4 features, it’s easy enough, it’s just 16 coalitions to sample to get all the Shapley values. 
For 32 features, that’s over 17 billion samples, which is entirely untenable. <em>To get around this, Lundberg and Lee devise the Shapley Kernel, a means of approximating shapley values through much fewer samples.</em></p><p>So what we do is we pass samples through the model, of various feature permutations of the particular datapoint that we’re trying to explain.</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222397887-4451072f-be0b-4e27-accc-e401b7ac35bf.png\" align=\"center\" alt=\"Shapley-Permu\" width=\"633\" height=\"407\"><!--kg-card-end: markdown--><p>Of course, most ML models won’t just let you omit a feature !</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222398010-2e845aae-b21e-48ea-b16e-5da41af83b86.png\" align=\"center\" alt=\"Shapley-Omit-Feature\" width=\"588\" height=\"112\">\n<!--kg-card-end: markdown--><p>So what we do is define a background dataset B, one that contains a set of representative datapoints that the model was trained over. 
</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222398330-d64ddccf-b8f2-4f5e-880a-dfe05d99eb77.png\" align=\"center\" alt=\"BackgroundDataset\" width=\"636\" height=\"239\">\n<!--kg-card-end: markdown--><p>We then fill in our omitted feature or features with values from the background dataset, while holding the features that are included in the permutation fixed to their original values.</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222398831-31f4f53d-2610-47d6-9b45-89a7ff663fe4.png\" align=\"center\" alt=\"IncludedFeaturesFromDatasetB\" width=\"621\" height=\"331\">\n<!--kg-card-end: markdown--><p>We then take the average of the model output over all of these new synthetic datapoints as our model output for that feature permutation, which we’ll call that y bar.</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222399307-fc75df99-27a3-4d11-8fff-d785616fbee9.png\" align=\"center\" alt=\"Shapley-Avg\" width=\"720\" height=\"291\">\n<!--kg-card-end: markdown--><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222399733-737c4ba1-23f2-4f97-996e-f3ace97fb3cf.png\" align=\"center\" alt=\"Shapley-ybar\" width=\"683\" height=\"340\">\n<!--kg-card-end: markdown--><p>So once we have a number of samples computed in this way, </p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222399945-9a2510d2-5595-42eb-887c-5b5762c9aa1e.png\" align=\"center\" alt=\"Shapley-NoOfYbar\" width=\"672\" height=\"400\">\n<!--kg-card-end: markdown--><p>We can formulate this as a weighted linear regression, with each feature assigned a coefficient.</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222400421-c3964125-f4d1-44e1-ba12-1231cb76671a.png\" align=\"center\" alt=\"SShapley-LinearReg\" width=\"672\" 
height=\"400\"><!--kg-card-end: markdown--><p>With a very specific choice of weighting for each sample, based on a combination of the total number of features in the model, the number of coalitions with the same number of features as this particular sample, and the number of features included and excluded in this permutation, we ensure that the solution to this weighted linear regression is such that the returned coefficients are equivalent to the Shapley values. </p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222400526-11e50df4-880d-44e8-bd15-812560a8e005.png\" align=\"center\" alt=\"Shapley-Weighting\" width=\"720\" height=\"148\"><!--kg-card-end: markdown--><p>This weighting scheme is the basis of the Shapley Kernel, and the weighted linear regression process as a whole is Kernel SHAP.</p><!--kg-card-begin: markdown--><img src=\"https://user-images.githubusercontent.com/33357428/222400590-c55c9b16-5ce4-4022-a6aa-73c294f2d1a8.png\" align=\"center\" alt=\"Shapley-Kernel-Weights\" width=\"720\" height=\"359\"><!--kg-card-end: markdown--><p>Now, there are a lot of other forms of SHAP that are presented in the paper (Deep SHAP, Low Order SHAP , Max SHAP , Linear SHAP , Tree SHAP etc) , ones that make use of model-specific assumptions and optimizations to speed up the algorithm and the sampling process, but Kernel SHAP is the one among them that is universal and can be applied to any type of machine learning model. This general applicability is why we chose Kernel SHAP as the first form of SHAP to implement for TrustyAI.</p><!--kg-card-begin: markdown--><h3 id=\"kernel-shap-example\">Kernel SHAP Example</h3>\n<!--kg-card-end: markdown--><p>I’ll run through an example of the Python SHAP implementation provided by Lundberg and Lee. 
So first I’ll grab a dataset to run our example over, and I’ve picked the Boston housing price dataset, which is a dataset consisting of various attributes about Boston neighborhoods and the corresponding house prices within that neighborhood.</p><p>Incomplete. Coming soon</p>","htmlAst":{"type":"root","children":[{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"In this article , we will talk about SHAP in detail. We will discuss What are Shapley Values ? , Mathematical foundation behind the Shapley Values and How SHAP (Shapley Additive Explanations) reframes the Shapley Value problem in detail. Also we will discuss What is Local Accuracy, Missingness, and Consistency in the context of explainable models , What is the Shapley Kernel etc. "}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h3","properties":{"id":"what-is-shapley-values"},"children":[{"type":"text","value":"What is Shapley Values ?"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"SHAP values were introduced by "},{"type":"element","tagName":"a","properties":{"href":"https://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf","rel":["noopener","ugc","nofollow"]},"children":[{"type":"text","value":"Lundberg and Lee (2017)"}]},{"type":"text","value":" as a way to provide local explanations for machine learning models. 
They built upon Shapley's work in cooperative game theory, using the Shapley value to explain how each feature in a model contributes to the prediction for a particular instance."}]},{"type":"element","tagName":"blockquote","properties":{"className":["kg-blockquote-alt"]},"children":[{"type":"text","value":"Shapley values were invented by Lloyd Shapley as a way of providing a fair solution to the following question: "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"if we have a coalition "},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"C"}]},{"type":"text","value":" that collaborates to produce a value "},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"text","value":"V"}]},{"type":"text","value":", how much did each individual member contribute to that final value ?"}]},{"type":"text","value":" "}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"So what does this mean ? We have a coalition C, a group of cooperating members that work together to produce some value V, called the coalition value. This could be something like, a corporation of employees that together generate a certain profit, or a dinner group running up a restaurant bill. 
We want to know exactly how much each member contributed to that final coalition value; what share of the profit does each employee deserve, how much each person in the dinner party owes to settle the bill."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/222118086-efdfcda4-01e6-4d4d-bc6d-30ce019c464b.png","align":"center","alt":"Shapley Values","width":720,"height":434},"children":[]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"However, answering this gets tricky when there are interacting effects between members, when certain permutations cause members to contribute more than the sum of their parts. "},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"To find a fair answer to this question that takes into account these interaction effects, we can compute the Shapley value for each member of the coalition."}]}]}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"So let’s see how we can compute the Shapley value for member 1 of our example coalition. 
The way this is done is by "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"sampling a coalition that contains member 1, and then looking at the coalition formed by removing that member."}]},{"type":"text","value":" "}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/222119980-a7c2c357-eecf-4ea9-8de6-7eefacf4e362.png","align":"center","alt":"Shapley Values","width":720,"height":404},"children":[]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"We then look at the respective values of these two coalitions, and compare the difference between the two."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/222120650-7a9cd97e-3dfd-488c-9a24-45638a75847a.png","align":"center","alt":"Shapley Values","width":720,"height":512},"children":[]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"This difference is the marginal contribution of member 1 to the coalition consisting of members 2, 3, and 4; how much member 1 contributed to that specific group."}]}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/222120708-c27b48f7-7712-4e7d-ae44-c998a859215c.png","align":"center","alt":"Shapley Values","width":720,"height":136},"children":[]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"So we then enumerate all such 
pairs of coalitions, that is, all pairs of coalitions that only differ based on whether or not member 1 is included, and then look at all the marginal contributions for each. "}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/222121573-e1c9b15d-d883-44f0-b6e3-41fff235ff06.png","align":"center","alt":"Shapley Values","width":720,"height":333},"children":[]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"The mean marginal contribution is the Shapley value of that member."}]}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/222121676-1e0e6852-0b66-44b7-bbd4-11fd8e25b39f.png","align":"center","alt":"Shapley Values","width":720,"height":143},"children":[]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"We can do this same process for each member of the coalition, and we’ve found a fair solution to our original question!"}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Mathematically, the whole process looks like this, but all we need is to know that the "},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"Shapley value is the average amount of contribution that a particular member makes to the coalition value."}]}]}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h3","properties":{"id":"shapley-values-to-shap"},"children":[{"type":"text","value":"Shapley Values To 
SHAP"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Now, translating this concept to model explainability is relatively straightforward, and that’s exactly what Scott Lundberg and Su-In Lee did in 2017 with their paper "},{"type":"element","tagName":"a","properties":{"href":"https://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf"},"children":[{"type":"text","value":"“A Unified Approach to Interpreting Model Predictions,”"}]},{"type":"text","value":" where they introduced SHAP. "}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/222123143-27853042-9e12-4526-91dd-466e04f8a68f.png","align":"center","alt":"Shapley Paper","width":720,"height":666},"children":[]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"SHAP reframes the Shapley value problem from one where we look at "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"how members of a coalition contribute to a coalition value"}]},{"type":"text","value":" to one where we look at "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"how individual features contribute to a model’s outputs."}]}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/222123344-4acf9530-e696-4ee7-80f0-4835cd9cc775.png","align":"center","alt":"Shapley Paper","width":580,"height":352},"children":[]},{"type":"comment","value":"kg-card-end: markdown"},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"They 
do this in a very specific way, one that we can get a clue to by looking at the name of their algorithm; "},{"type":"element","tagName":"span","properties":{"style":"color:red"},"children":[{"type":"text","value":"Shapley"}]},{"type":"text","value":" "},{"type":"element","tagName":"span","properties":{"style":"color:blue"},"children":[{"type":"text","value":"Additive"}]},{"type":"text","value":" "},{"type":"element","tagName":"span","properties":{"style":"color:green"},"children":[{"type":"text","value":"Explanations"}]},{"type":"text","value":". We know what "},{"type":"element","tagName":"span","properties":{"style":"color:red"},"children":[{"type":"text","value":"Shapley"}]},{"type":"text","value":" values are, we know what "},{"type":"element","tagName":"span","properties":{"style":"color:green"},"children":[{"type":"text","value":"Explanations"}]},{"type":"text","value":" are, but what do they mean by "},{"type":"element","tagName":"span","properties":{"style":"color:blue"},"children":[{"type":"text","value":"Additive"}]},{"type":"text","value":"?"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Lundberg and Lee define an additive feature attribution as follows: if we have a set a of "},{"type":"element","tagName":"span","properties":{"style":"color:blue"},"children":[{"type":"text","value":"inputs x"}]},{"type":"text","value":", and "},{"type":"element","tagName":"span","properties":{"style":"color:blue"},"children":[{"type":"text","value":"a model f(x)"}]},{"type":"text","value":", we can define a set of simplified "},{"type":"element","tagName":"span","properties":{"style":"color:green"},"children":[{"type":"text","value":"local inputs x’"}]},{"type":"text","value":" (which usually means that we turn a feature vector into a discrete binary vector, where features are either included or 
excluded) and we can also define an "},{"type":"element","tagName":"span","properties":{"style":"color:green"},"children":[{"type":"text","value":"explanatory model g"}]},{"type":"text","value":"."}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"What we need to ensure is that One: if x’ is roughly equal to x then g(x’) should be roughly equal to f(x)"}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/222126320-96c62ae5-2dcd-46eb-8a64-38727338c7dd.png","align":"center","alt":"SHAP","width":404,"height":222},"children":[]},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"two: g must take below form, where "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"phi_0 is the null output of the model"}]},{"type":"text","value":", that is, the average output of the model, and "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"phi_i is the explained effect of feature_i"}]},{"type":"text","value":"; how much that feature changes the output of the model. "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"This is called it’s attribution."}]}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/222126643-5e4a7d98-53a6-4389-86cd-ac6d37041150.png","align":"center","alt":"SHAP","width":625,"height":261},"children":[]},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"If we have these two, we have an explanatory model that has additive feature attribution. 
The advantage of this form of explanation is really easy to interpret; we can see the exact contribution and importance of each feature just by looking at the phi values."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h3","properties":{"id":"local-accuracy-missingness-and-consistency"},"children":[{"type":"text","value":"Local Accuracy, Missingness and Consistency"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Now Lundberg and Lee go on to describe a set of three desirable properties of such an additive feature method; "},{"type":"element","tagName":"span","properties":{"style":"color:red"},"children":[{"type":"text","value":"local accuracy"}]},{"type":"text","value":", "},{"type":"element","tagName":"span","properties":{"style":"color:blue"},"children":[{"type":"text","value":"missingness"}]},{"type":"text","value":", and "},{"type":"element","tagName":"span","properties":{"style":"color:green"},"children":[{"type":"text","value":"consistency"}]},{"type":"text","value":"."}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h4","properties":{"id":"local-accuracy"},"children":[{"type":"text","value":"Local Accuracy"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Local accuracy : it simply says if the input and the simplified input are roughly the same, then the actual model and the explanatory model should produce roughly the same output. 
"}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h4","properties":{"id":"missingness"},"children":[{"type":"text","value":"Missingness"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Missingness states that if a feature is excluded from the model, its attribution must be zero; that is, the only thing that can affect the output of the explanation model is the inclusion of features, not the exclusion. "}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h4","properties":{"id":"consistency"},"children":[{"type":"text","value":"Consistency"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Finally, we have consistency (and this one’s a little hard to represent mathematically), but it states that if the original model changes so that a particular feature’s contribution changes, the attribution in the explanatory model cannot change in the opposite direction; "}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"So for example, if we have a new model where a specific feature has a more positive contribution than in the original; the attribution in our new explanatory model cannot decrease. 
"}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/222395551-04403b51-1e8f-4bbe-84d7-1bb2f9d0b966.png","align":"center","alt":"SHAP","width":720,"height":299},"children":[]},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Now while a bunch of different explanation methods satisfy some of these properties, "},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"Lundberg and Lee argue that only SHAP satisfies all three"}]},{"type":"text","value":"; "},{"type":"element","tagName":"strong","properties":{},"children":[{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"if the feature attributions in our additive explanatory model are specifically chosen to be the shapley values of those features, then all three properties are upheld."}]}]}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h3","properties":{"id":"shapley-kernel"},"children":[{"type":"text","value":"Shapley Kernel"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"The problem with SHAP, however, is that computing Shapley values means you have to sample the coalition values for each possible feature permutation, which in a model explainability setting means we have to evaluate our model that number of times. "}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"For a model that operates over 4 features, it’s easy enough, it’s just 16 coalitions to sample to get all the Shapley values. For 32 features, that’s over 17 billion samples, which is entirely untenable. 
"},{"type":"element","tagName":"em","properties":{},"children":[{"type":"text","value":"To get around this, Lundberg and Lee devise the Shapley Kernel, a means of approximating shapley values through much fewer samples."}]}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"So what we do is we pass samples through the model, of various feature permutations of the particular datapoint that we’re trying to explain."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/222397887-4451072f-be0b-4e27-accc-e401b7ac35bf.png","align":"center","alt":"Shapley-Permu","width":633,"height":407},"children":[]},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Of course, most ML models won’t just let you omit a feature !"}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/222398010-2e845aae-b21e-48ea-b16e-5da41af83b86.png","align":"center","alt":"Shapley-Omit-Feature","width":588,"height":112},"children":[]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"So what we do is define a background dataset B, one that contains a set of representative datapoints that the model was trained over. 
"}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/222398330-d64ddccf-b8f2-4f5e-880a-dfe05d99eb77.png","align":"center","alt":"BackgroundDataset","width":636,"height":239},"children":[]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"We then fill in our omitted feature or features with values from the background dataset, while holding the features that are included in the permutation fixed to their original values."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/222398831-31f4f53d-2610-47d6-9b45-89a7ff663fe4.png","align":"center","alt":"IncludedFeaturesFromDatasetB","width":621,"height":331},"children":[]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"We then take the average of the model output over all of these new synthetic datapoints as our model output for that feature permutation, which we’ll call that y bar."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/222399307-fc75df99-27a3-4d11-8fff-d785616fbee9.png","align":"center","alt":"Shapley-Avg","width":720,"height":291},"children":[]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"comment","value":"kg-card-begin: 
markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/222399733-737c4ba1-23f2-4f97-996e-f3ace97fb3cf.png","align":"center","alt":"Shapley-ybar","width":683,"height":340},"children":[]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"So once we have a number of samples computed in this way, "}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/222399945-9a2510d2-5595-42eb-887c-5b5762c9aa1e.png","align":"center","alt":"Shapley-NoOfYbar","width":672,"height":400},"children":[]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"We can formulate this as a weighted linear regression, with each feature assigned a coefficient."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/222400421-c3964125-f4d1-44e1-ba12-1231cb76671a.png","align":"center","alt":"SShapley-LinearReg","width":672,"height":400},"children":[]},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"With a very specific choice of weighting for each sample, based on a combination of the total number of features in the model, the number of coalitions with the same number of features as this particular sample, and the number of features included and excluded in this permutation, we ensure that the solution to this weighted linear regression is such that the returned coefficients are equivalent to the Shapley values. 
"}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/222400526-11e50df4-880d-44e8-bd15-812560a8e005.png","align":"center","alt":"Shapley-Weighting","width":720,"height":148},"children":[]},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"This weighting scheme is the basis of the Shapley Kernel, and the weighted linear regression process as a whole is Kernel SHAP."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"img","properties":{"src":"https://user-images.githubusercontent.com/33357428/222400590-c55c9b16-5ce4-4022-a6aa-73c294f2d1a8.png","align":"center","alt":"Shapley-Kernel-Weights","width":720,"height":359},"children":[]},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Now, there are a lot of other forms of SHAP that are presented in the paper (Deep SHAP, Low Order SHAP , Max SHAP , Linear SHAP , Tree SHAP etc) , ones that make use of model-specific assumptions and optimizations to speed up the algorithm and the sampling process, but Kernel SHAP is the one among them that is universal and can be applied to any type of machine learning model. This general applicability is why we chose Kernel SHAP as the first form of SHAP to implement for TrustyAI."}]},{"type":"comment","value":"kg-card-begin: markdown"},{"type":"element","tagName":"h3","properties":{"id":"kernel-shap-example"},"children":[{"type":"text","value":"Kernel SHAP Example"}]},{"type":"text","value":"\n"},{"type":"comment","value":"kg-card-end: markdown"},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"I’ll run through an example of the Python SHAP implementation provided by Lundberg and Lee. 
So first I’ll grab a dataset to run our example over, and I’ve picked the Boston housing price dataset, which is a dataset consisting of various attributes about Boston neighborhoods and the corresponding house prices within that neighborhood."}]},{"type":"element","tagName":"p","properties":{},"children":[{"type":"text","value":"Incomplete. Coming soon"}]}],"data":{"quirksMode":false}},"tableOfContents":[{"id":"what-is-shapley-values","heading":"What is Shapley Values ?"},{"id":"shapley-values-to-shap","heading":"Shapley Values To SHAP"},{"id":"local-accuracy-missingness-and-consistency","heading":"Local Accuracy, Missingness and Consistency","items":[{"id":"local-accuracy","heading":"Local Accuracy"},{"id":"missingness","heading":"Missingness"},{"id":"consistency","heading":"Consistency"}]},{"id":"shapley-kernel","heading":"Shapley Kernel"},{"id":"kernel-shap-example","heading":"Kernel SHAP Example"}]},"featureImageSharp":{"base":"photo-1522069213448-443a614da9b6.jpg","publicURL":"/static/c604d810168bc7c01df3faa4bf1a4f00/photo-1522069213448-443a614da9b6.jpg","imageMeta":{"width":2000,"height":1330},"childImageSharp":{"fluid":{"base64":"data:image/jpeg;base64,/9j/2wBDABALDA4MChAODQ4SERATGCgaGBYWGDEjJR0oOjM9PDkzODdASFxOQERXRTc4UG1RV19iZ2hnPk1xeXBkeFxlZ2P/2wBDARESEhgVGC8aGi9jQjhCY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2P/wgARCAANABQDASIAAhEBAxEB/8QAFwABAQEBAAAAAAAAAAAAAAAAAAMBBf/EABQBAQAAAAAAAAAAAAAAAAAAAAH/2gAMAwEAAhADEAAAAeVsLDEJ/8QAGxAAAgEFAAAAAAAAAAAAAAAAARIDAAIQERP/2gAIAQEAAQUCjXZjK46XJX//xAAWEQADAAAAAAAAAAAAAAAAAAAQESH/2gAIAQMBAT8BiH//xAAUEQEAAAAAAAAAAAAAAAAAAAAQ/9oACAECAQE/AT//xAAYEAADAQEAAAAAAAAAAAAAAAAQEkEAIv/aAAgBAQAGPwLrNCsH/8QAGRABAQADAQAAAAAAAAAAAAAAAREAEIFx/9oACAEBAAE/IQlF7MG/Tm1Lun//2gAMAwEAAgADAAAAEAwP/8QAFhEBAQEAAAAAAAAAAAAAAAAAARAh/9oACAEDAQE/EBwSf//EABYRAQEBAAAAAAAAAAAAAAAAABEBEP/aAAgBAgEBPxArn//EABwQAAICAgMAAAAAAAAAAAAAAAERITEQYQBBwf/aAAgBAQABPxAarAAhquZ4ENRbaNF+Zr1qAFA6x//Z","aspectRatio":1.5086206896551724,"src":"/static
/c604d810168bc7c01df3faa4bf1a4f00/ea4ab/photo-1522069213448-443a614da9b6.jpg","srcSet":"/static/c604d810168bc7c01df3faa4bf1a4f00/477ba/photo-1522069213448-443a614da9b6.jpg 175w,\n/static/c604d810168bc7c01df3faa4bf1a4f00/06776/photo-1522069213448-443a614da9b6.jpg 350w,\n/static/c604d810168bc7c01df3faa4bf1a4f00/ea4ab/photo-1522069213448-443a614da9b6.jpg 700w,\n/static/c604d810168bc7c01df3faa4bf1a4f00/3055e/photo-1522069213448-443a614da9b6.jpg 1050w,\n/static/c604d810168bc7c01df3faa4bf1a4f00/eff08/photo-1522069213448-443a614da9b6.jpg 1400w,\n/static/c604d810168bc7c01df3faa4bf1a4f00/4e5f3/photo-1522069213448-443a614da9b6.jpg 2000w","srcWebp":"/static/c604d810168bc7c01df3faa4bf1a4f00/89afa/photo-1522069213448-443a614da9b6.webp","srcSetWebp":"/static/c604d810168bc7c01df3faa4bf1a4f00/9fca7/photo-1522069213448-443a614da9b6.webp 175w,\n/static/c604d810168bc7c01df3faa4bf1a4f00/37a4e/photo-1522069213448-443a614da9b6.webp 350w,\n/static/c604d810168bc7c01df3faa4bf1a4f00/89afa/photo-1522069213448-443a614da9b6.webp 700w,\n/static/c604d810168bc7c01df3faa4bf1a4f00/78e7a/photo-1522069213448-443a614da9b6.webp 1050w,\n/static/c604d810168bc7c01df3faa4bf1a4f00/03d34/photo-1522069213448-443a614da9b6.webp 1400w,\n/static/c604d810168bc7c01df3faa4bf1a4f00/49d6b/photo-1522069213448-443a614da9b6.webp 2000w","sizes":"(max-width: 700px) 100vw, 700px"}}}}}]}},"pageContext":{"slug":"causal-machine-learning-part-5","prev":"explaining-bayesian-neural-networks","next":"causal-machine-learning-part-4","tag":"machine-learning","limit":3,"skip":0,"primaryTagCount":7,"collectionPaths":{}}},"staticQueryHashes":["1272700106","1676991999","2138873178","2546165603","2681841279","2938721187","293880488","3052966952","4156497161"]}