Skip to content

Scores, Chi2, etc. with BNLearner

Creative Commons LicenseaGrUMinteractive online version
import pyagrum as gum
import pyagrum.lib.notebook as gnb
bn = gum.loadBN("res/asia.bif")
bn
G dyspnoea dyspnoea tuberculosis tuberculosis tuberculos_or_cancer tuberculos_or_cancer tuberculosis->tuberculos_or_cancer smoking smoking bronchitis bronchitis smoking->bronchitis lung_cancer lung_cancer smoking->lung_cancer bronchitis->dyspnoea visit_to_Asia visit_to_Asia visit_to_Asia->tuberculosis positive_XraY positive_XraY lung_cancer->tuberculos_or_cancer tuberculos_or_cancer->dyspnoea tuberculos_or_cancer->positive_XraY
## we create a quite large database
gum.generateSample(bn, 500000, "out/sample_score.csv", False)
-1613359.4355354053

Testing d-separations using chi2 in the database

Section titled “Testing d-separations using chi2 in the database”
## do not forget that the generation process above is random : from time to time, the tests my not be correct...
def isIndep(pvalue):
return pvalue >= 0.05
def testIndepFromChi2(learner, var1, var2, kno=[]):
"""
Just prints the resultat of the chi2
"""
stat, pvalue = learner.chi2(var1, var2, kno)
if len(kno) == 0:
print("From Chi2 tests, is '{}' indep from '{}' ==> {}".format(var1, var2, isIndep(pvalue)))
else:
print("From Chi2 tests, is '{}' indep from '{}' given {} : {}".format(var1, var2, kno, isIndep(pvalue)))
learner = gum.BNLearner("out/sample_score.csv")
testIndepFromChi2(learner, "visit_to_Asia", "smoking")
testIndepFromChi2(learner, "visit_to_Asia", "smoking", ["tuberculos_or_cancer"])
testIndepFromChi2(learner, "visit_to_Asia", "smoking", ["positive_XraY"])
testIndepFromChi2(learner, "dyspnoea", "smoking")
testIndepFromChi2(learner, "dyspnoea", "smoking", ["lung_cancer", "bronchitis"])
From Chi2 tests, is 'visit_to_Asia' indep from 'smoking' ==> True
From Chi2 tests, is 'visit_to_Asia' indep from 'smoking' given ['tuberculos_or_cancer'] : False
From Chi2 tests, is 'visit_to_Asia' indep from 'smoking' given ['positive_XraY'] : False
From Chi2 tests, is 'dyspnoea' indep from 'smoking' ==> False
From Chi2 tests, is 'dyspnoea' indep from 'smoking' given ['lung_cancer', 'bronchitis'] : True

Evolution of chi2 p-values w.r.t the size of the database (in Asia)

Section titled “Evolution of chi2 p-values w.r.t the size of the database (in Asia)”
def consolidationIndepFromChi2(bn, size, lindep, nbr=20):
"""
Using $nbr$ generated databases of size $size$ from the bn $bn$,
consolidate the p-value for a list $lindep$ of conditional independence to test.
return the list of consolidated pValues
"""
pvalue_cumul = [0.0] * len(lindep)
for i in range(nbr):
gum.generateSample(bn, size, "out/sample_score.csv", False)
learner = gum.BNLearner("out/sample_score.csv")
for i, (var1, var2, kno) in enumerate(lindep):
stat, pvalue = learner.chi2(var1, var2, kno)
pvalue_cumul[i] += pvalue
return [p / nbr for p in pvalue_cumul]
sizes = [50, 100, 500, 1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000]
pvalues1, pvalues2, pvalues3, pvalues4, pvalues5, pvalues6 = zip(
*[
consolidationIndepFromChi2(
bn,
siz,
[
("visit_to_Asia", "smoking", ["tuberculos_or_cancer"]),
("visit_to_Asia", "smoking", []),
("dyspnoea", "smoking", []),
("dyspnoea", "smoking", ["lung_cancer", "bronchitis"]),
("tuberculosis", "bronchitis", []),
("tuberculosis", "bronchitis", ["dyspnoea"]),
],
)
for siz in sizes
]
)
FIGSIZE = (8, 3)
%matplotlib inline
from pylab import *
import matplotlib.patches as patches
fig = figure(figsize=FIGSIZE)
ax = fig.add_subplot(1, 1, 1)
ax.plot(sizes, pvalues1, label="NOT(A indep S given TorC)", linestyle="dashed")
ax.plot(sizes, pvalues2, label="A indep S")
ax.plot(sizes, pvalues3, label="NOT(D indep S)", linestyle="dashed")
ax.plot(sizes, pvalues4, label="D indep S given L,B")
ax.plot(sizes, pvalues5, label="T indep B")
ax.plot(sizes, pvalues6, label="NOT(T indep B given D)", linestyle="dashed")
ax.tick_params(rotation=90)
ax.set_xlabel("data size")
ax.set_ylabel("pValue")
ax.legend(bbox_to_anchor=(0.15, 0.88, 0.7, 0.102), loc=3, ncol=3, mode="expand", borderaxespad=0.0)
rect = patches.Rectangle((0, 0), max(sizes), 0.05, linewidth=1, edgecolor="#FF8888", facecolor="#FF8888")
ax.add_patch(rect)
ax.annotate(
"Critical region",
xytext=(190000, 0.2),
xy=(190000, 0.05),
ha="right",
va="center",
arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=-0.15"),
bbox=dict(boxstyle="square", fc="w"),
)
ax.set_title("Chi2 pvalue=f(datasize)")
gnb.flow.add(fig)
gnb.flow.add(gnb.getBN(gum.fastBN("A->T->TorC->X;S->C->TorC->D<-B<-S")))
gnb.flow.new_line()
fig

svg

Testing d-separations using G2 in the database

Section titled “Testing d-separations using G2 in the database”
def testIndepFromG2(learner, var1, var2, kno=[]):
"""
Just prints the resultat of the G2
"""
stat, pvalue = learner.G2(var1, var2, kno)
if len(kno) == 0:
print("From G2 tests, is '{}' indep from '{}' ==> {}".format(var1, var2, isIndep(pvalue)))
else:
print("From G2 tests, is '{}' indep from '{}' given {} : {}".format(var1, var2, kno, isIndep(pvalue)))
learner = gum.BNLearner("out/sample_score.csv")
testIndepFromG2(learner, "visit_to_Asia", "smoking")
testIndepFromG2(learner, "visit_to_Asia", "smoking", ["tuberculos_or_cancer"])
testIndepFromG2(learner, "visit_to_Asia", "smoking", ["positive_XraY"])
testIndepFromG2(learner, "dyspnoea", "smoking")
testIndepFromG2(learner, "dyspnoea", "smoking", ["lung_cancer", "bronchitis"])
From G2 tests, is 'visit_to_Asia' indep from 'smoking' ==> True
From G2 tests, is 'visit_to_Asia' indep from 'smoking' given ['tuberculos_or_cancer'] : False
From G2 tests, is 'visit_to_Asia' indep from 'smoking' given ['positive_XraY'] : False
From G2 tests, is 'dyspnoea' indep from 'smoking' ==> False
From G2 tests, is 'dyspnoea' indep from 'smoking' given ['lung_cancer', 'bronchitis'] : True

Evolution of G2 p-values w.r.t the size of the database (in Asia)

Section titled “Evolution of G2 p-values w.r.t the size of the database (in Asia)”
def consolidationIndepFromG2(bn, size, lindep, nbr=20):
"""
Using $nbr$ generated databases of size $size$ from the bn $bn$,
consolidate the p-value for a list $lindep$ of conditional independence to test.
return the list of consolidated pValues
"""
pvalue_cumul = [0.0] * len(lindep)
for i in range(nbr):
gum.generateSample(bn, size, "out/sample_chi2.csv", False)
learner = gum.BNLearner("out/sample_chi2.csv")
for i, (var1, var2, kno) in enumerate(lindep):
stat, pvalue = learner.G2(var1, var2, kno)
pvalue_cumul[i] += pvalue
return [p / nbr for p in pvalue_cumul]
sizes = [50, 100, 500, 1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000]
pvalues1, pvalues2, pvalues3, pvalues4, pvalues5, pvalues6 = zip(
*[
consolidationIndepFromG2(
bn,
siz,
[
("visit_to_Asia", "smoking", ["tuberculos_or_cancer"]),
("visit_to_Asia", "smoking", []),
("dyspnoea", "smoking", []),
("dyspnoea", "smoking", ["lung_cancer", "bronchitis"]),
("tuberculosis", "bronchitis", []),
("tuberculosis", "bronchitis", ["dyspnoea"]),
],
)
for siz in sizes
]
)
fig = figure(figsize=FIGSIZE)
ax = fig.add_subplot(1, 1, 1)
ax.plot(sizes, pvalues1, label="NOT(A indep S given TorC)", linestyle="dashed")
ax.plot(sizes, pvalues2, label="A indep S")
ax.plot(sizes, pvalues3, label="NOT(D indep S)", linestyle="dashed")
ax.plot(sizes, pvalues4, label="D indep S given L,B")
ax.plot(sizes, pvalues5, label="T indep B")
ax.plot(sizes, pvalues6, label="NOT(T indep B given D)", linestyle="dashed")
ax.tick_params(rotation=90)
ax.set_xlabel("data size")
ax.set_ylabel("pValue")
ax.legend(bbox_to_anchor=(0.15, 0.83, 0.7, 0.102), loc=3, ncol=2, mode="expand", borderaxespad=0.0)
rect = patches.Rectangle((0, 0), max(sizes), 0.05, linewidth=1, edgecolor="#FF8888", facecolor="#FF8888")
ax.add_patch(rect)
ax.annotate(
"Critical region",
xytext=(190000, 0.2),
xy=(190000, 0.05),
ha="right",
va="center",
arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=-0.15"),
bbox=dict(boxstyle="square", fc="w"),
)
ax.set_title("G2 pvalue=f(datasize)")
gnb.flow.add(fig)
gnb.flow.add(gnb.getBN(gum.fastBN("A->T->TorC->X;S->C->TorC->D<-B<-S")))
gnb.flow.new_line()

<pyagrum.lib.notebook.FlowLayout at 0x10e4c9010>

With BNLearner, you can also check the joint (condtional) log-likelihood in the base

bn
G dyspnoea dyspnoea tuberculosis tuberculosis tuberculos_or_cancer tuberculos_or_cancer tuberculosis->tuberculos_or_cancer smoking smoking bronchitis bronchitis smoking->bronchitis lung_cancer lung_cancer smoking->lung_cancer bronchitis->dyspnoea visit_to_Asia visit_to_Asia visit_to_Asia->tuberculosis positive_XraY positive_XraY lung_cancer->tuberculos_or_cancer tuberculos_or_cancer->dyspnoea tuberculos_or_cancer->positive_XraY
siz = 10000
gum.generateSample(bn, siz, "out/sample_score.csv", False)
learner = gum.BNLearner("out/sample_score.csv")
def affLL(learner, s1, s2=[]):
if len(s2) == 0:
print("{} : {}".format(s1, learner.logLikelihood(s1)))
else:
print("{}|{} : {}".format(s1, s2, learner.logLikelihood(s1, s2)))
def dsepByLL(learner, x, y, z): # is X indep of Y given Z ?
lxy_z = learner.logLikelihood([x, y], [z])
lx_z = learner.logLikelihood([x], [z])
ly_z = learner.logLikelihood([y], [z])
print("{} indep {} given {} : {}".format(x, y, z, lxy_z - lx_z - ly_z))
print("Condional Joint LogLikelihood")
affLL(learner, ["lung_cancer", "bronchitis", "smoking"])
affLL(learner, ["smoking"])
affLL(learner, ["lung_cancer", "bronchitis"], ["smoking"])
print("--------------")
print("LL indep test")
dsepByLL(learner, "lung_cancer", "bronchitis", "smoking")
dsepByLL(learner, "tuberculos_or_cancer", "bronchitis", "dyspnoea")
Condional Joint LogLikelihood
['lung_cancer', 'bronchitis', 'smoking'] : -21843.331556653047
['smoking'] : -9999.81966236849
['lung_cancer', 'bronchitis']|['smoking'] : -11843.511894284557
--------------
LL indep test
lung_cancer indep bronchitis given smoking : 1.65264482035127
tuberculos_or_cancer indep bronchitis given dyspnoea : 172.72461157616453

Evolution of conditional log-likelihood w.r.t the size of the database (in Asia)

Section titled “Evolution of conditional log-likelihood w.r.t the size of the database (in Asia)”
def consolidationIndepFromLL(bn, size, lindep, nbr=20):
"""
Using $nbr$ generated databases of size $size$ from the bn $bn$,
consolidate the logLikelihoos for a list $lindep$ of conditional independence to test.
return the list of consolidated pValues
"""
LL_cumul = [0.0] * len(lindep)
for i in range(nbr):
gum.generateSample(bn, size, "out/sample_score.csv", False)
learner = gum.BNLearner("out/sample_score.csv")
for i, (var1, var2, kno) in enumerate(lindep):
LL12 = learner.logLikelihood([var1, var2], kno)
LL1 = learner.logLikelihood([var1], kno)
LL2 = learner.logLikelihood([var2], kno)
LL_cumul[i] += (LL12 - LL1 - LL2) / size
return [p / nbr for p in LL_cumul]
sizes = [50, 100, 500, 1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000]
LL1, LL2, LL3, LL4, LL5, LL6 = zip(
*[
consolidationIndepFromLL(
bn,
siz,
[
("visit_to_Asia", "smoking", ["tuberculos_or_cancer"]),
("visit_to_Asia", "smoking", []),
("dyspnoea", "smoking", []),
("dyspnoea", "smoking", ["lung_cancer", "bronchitis"]),
("tuberculosis", "bronchitis", []),
("tuberculosis", "bronchitis", ["dyspnoea"]),
],
)
for siz in sizes
]
)
%matplotlib inline
from pylab import *
import matplotlib.patches as patches
fig = figure(figsize=FIGSIZE)
ax = fig.add_subplot(1, 1, 1)
ax.plot(sizes, LL1, label="NOT(A indep S given TorC)", linestyle="dashed")
ax.plot(sizes, LL2, label="A indep S")
ax.plot(sizes, LL3, label="NOT(D indep S)", linestyle="dashed")
ax.plot(sizes, LL4, label="D indep S given C,B")
ax.plot(sizes, LL5, label="T indep B")
ax.plot(sizes, LL6, label="NOT(T indep B given D)", linestyle="dashed")
ax.tick_params(rotation=90)
ax.set_xlabel("data size")
ax.set_ylabel("LL/size")
ax.semilogy()
ax.legend(bbox_to_anchor=(0.15, 0.8, 0.8, 0.102), loc=3, ncol=3, mode="expand", borderaxespad=0.0)
ax.set_title("logLikelihood=f(datasize)")
gnb.flow.add(fig)
gnb.flow.add(gnb.getBN(gum.fastBN("A->T->TorC->X;S->C->TorC->D<-B<-S")))
gnb.flow.new_line();
gnb.flow.display()
PyAgrum inline image
G A A T T A->T X X D D TorC TorC TorC->X TorC->D C C C->TorC S S S->C B B S->B B->D T->TorC

PyAgrum inline image
G A A T T A->T X X D D TorC TorC TorC->X TorC->D C C C->TorC S S S->C B B S->B B->D T->TorC

PyAgrum inline image
G A A T T A->T X X D D TorC TorC TorC->X TorC->D C C C->TorC S S S->C B B S->B B->D T->TorC