Source code for bamt.redef_HC

"""
**********************
Greedy Hill-Climbing
for Structure Learning
**********************

Code for Searching through the space of
possible Bayesian Network structures.

Various optimization procedures are employed,
from greedy search to simulated annealing, and
so on - mostly using scipy.optimize.

Local search - possible moves:
- Add edge
- Delete edge
- Invert edge

Strategies to improve Greedy Hill-Climbing:
- Random Restarts
    - when we get stuck, take some number of
    random steps and then start climbing again.
- Tabu List
    - keep a list of the K steps most recently taken,
    and say that the search cannt reverse (undo) any
    of these steps.
"""

from bamt.external.pyBN.classes.bayesnet import BayesNet
from bamt.external.pyBN.utils.graph import would_cause_cycle
from bamt.mi_entropy_gauss import mi_gauss
from bamt.redef_info_scores import log_lik_local, BIC_local, AIC_local


[docs] def hc( data, metric="MI", max_iter=200, debug=False, init_nodes=None, restriction=None, init_edges=None, remove_geo_edges=True, black_list=None, ): """ Greedy Hill Climbing search proceeds by choosing the move which maximizes the increase in fitness of the network at the current step. It continues until it reaches a point where there does not exist any feasible single move that increases the network fitness. It is called "greedy" because it simply does what is best at the current iteration only, and thus does not look ahead to what may be better later on in the search. For computational saving, a Priority Queue (python's heapq) can be used to maintain the best operators and reduce the complexity of picking the best operator from O(n^2) to O(nlogn). This works by maintaining the heapq of operators sorted by their delta score, and each time a move is made, we only have to recompute the O(n) delta-scores which were affected by the move. The rest of the operator delta-scores are not affected. For additional computational efficiency, we can cache the sufficient statistics for various families of distributions - therefore, computing the mutual information for a given family only needs to happen once. The possible moves are the following: - add edge - delete edge - invert edge Arguments --------- *data* : pd.DataFrame The data from which the Bayesian network structure will be learned. *metric* : a string Which score metric to use. Options: - AIC - BIC / MDL - LL (log-likelihood) *max_iter* : an integer The maximum number of iterations of the hill-climbing algorithm to run. Note that the algorithm will terminate on its own if no improvement is made in a given iteration. *debug* : boolean Whether to print(the scores/moves of the) algorithm as its happening. *init_nodes* : a list of initialize nodes (number of nodes according to the dataset) *restriction* : a list of 2-tuples For MMHC algorithm, the list of allowable edge additions. Returns ------- *bn* : a BayesNet object """ nrow = data.shape[0] ncol = data.shape[1] names = range(ncol) # INITIALIZE NETWORK W/ NO EDGES # maintain children and parents dict for fast lookups c_dict = dict([(n, []) for n in names]) p_dict = dict([(n, []) for n in names]) if init_edges: for edge in init_edges: c_dict[edge[0]].append(edge[1]) p_dict[edge[1]].append(edge[0]) bn = BayesNet(c_dict) mutual_information = mi_gauss if metric == "BIC": mutual_information = BIC_local if metric == "AIC": mutual_information = AIC_local if metric == "LL": mutual_information = log_lik_local data = data.values cache = dict() _iter = 0 improvement = True while improvement: improvement = False max_delta = 0 if debug: print("ITERATION: ", _iter) ### TEST ARC ADDITIONS ### for u in bn.nodes(): for v in bn.nodes(): if ( v not in c_dict[u] and u != v and not would_cause_cycle(c_dict, u, v) and len(p_dict[v]) != 3 ): # FOR MMHC ALGORITHM -> Edge Restrictions if ( (init_nodes is None or not (v in init_nodes)) and (restriction is None or (u, v) in restriction) and (black_list is None or not ((u, v) in black_list)) ): # SCORE FOR 'V' -> gaining a parent # without 'u' as parent old_cols = (v,) + tuple(p_dict[v]) if old_cols not in cache: cache[old_cols] = mutual_information(data[:, old_cols]) mi_old = cache[old_cols] new_cols = old_cols + (u,) # with'u' as parent if new_cols not in cache: cache[new_cols] = mutual_information(data[:, new_cols]) mi_new = cache[new_cols] delta_score = nrow * (mi_old - mi_new) if delta_score > max_delta: if debug: print("Improved Arc Addition: ", (u, v)) print("Delta Score: ", delta_score) max_delta = delta_score max_operation = "Addition" max_arc = (u, v) # ### TEST ARC DELETIONS ### for u in bn.nodes(): for v in bn.nodes(): if v in c_dict[u]: # SCORE FOR 'V' -> losing a parent old_cols = (v,) + tuple(p_dict[v]) # with 'u' as parent if old_cols not in cache: cache[old_cols] = mutual_information(data[:, old_cols]) mi_old = cache[old_cols] new_cols = tuple([i for i in old_cols if i != u]) if new_cols not in cache: cache[new_cols] = mutual_information(data[:, new_cols]) mi_new = cache[new_cols] delta_score = nrow * (mi_old - mi_new) if delta_score > max_delta: if init_edges is None: if debug: print("Improved Arc Deletion: ", (u, v)) print("Delta Score: ", delta_score) max_delta = delta_score max_operation = "Deletion" max_arc = (u, v) else: if (u, v) in init_edges: if remove_geo_edges: if debug: print("Improved Arc Deletion: ", (u, v)) print("Delta Score: ", delta_score) max_delta = delta_score max_operation = "Deletion" max_arc = (u, v) else: if debug: print("Improved Arc Deletion: ", (u, v)) print("Delta Score: ", delta_score) max_delta = delta_score max_operation = "Deletion" max_arc = (u, v) # ### TEST ARC REVERSALS ### for u in bn.nodes(): for v in bn.nodes(): if ( v in c_dict[u] and not would_cause_cycle(c_dict, v, u, reverse=True) and len(p_dict[u]) != 3 and (init_nodes is None or not (u in init_nodes)) and (restriction is None or (v, u) in restriction) and (black_list is None or not ((v, u) in black_list)) ): old_cols = (u,) + tuple(p_dict[v]) # without 'v' as parent if old_cols not in cache: cache[old_cols] = mutual_information(data[:, old_cols]) mi_old = cache[old_cols] new_cols = old_cols + (v,) # with 'v' as parent if new_cols not in cache: cache[new_cols] = mutual_information(data[:, new_cols]) mi_new = cache[new_cols] delta1 = -1 * nrow * (mi_old - mi_new) # SCORE FOR 'V' -> losing 'u' as parent old_cols = (v,) + tuple(p_dict[v]) # with 'u' as parent if old_cols not in cache: cache[old_cols] = mutual_information(data[:, old_cols]) mi_old = cache[old_cols] # without 'u' as parent new_cols = tuple([u for i in old_cols if i != u]) if new_cols not in cache: cache[new_cols] = mutual_information(data[:, new_cols]) mi_new = cache[new_cols] delta2 = nrow * (mi_old - mi_new) # COMBINED DELTA-SCORES delta_score = delta1 + delta2 if delta_score > max_delta: if init_edges is None: if debug: print("Improved Arc Reversal: ", (u, v)) print("Delta Score: ", delta_score) max_delta = delta_score max_operation = "Reversal" max_arc = (u, v) else: if (u, v) in init_edges: if remove_geo_edges: if debug: print("Improved Arc Reversal: ", (u, v)) print("Delta Score: ", delta_score) max_delta = delta_score max_operation = "Reversal" max_arc = (u, v) else: if debug: print("Improved Arc Reversal: ", (u, v)) print("Delta Score: ", delta_score) max_delta = delta_score max_operation = "Reversal" max_arc = (u, v) if max_delta != 0: improvement = True u, v = max_arc if max_operation == "Addition": if debug: print("ADDING: ", max_arc, "\n") c_dict[u].append(v) p_dict[v].append(u) elif max_operation == "Deletion": if debug: print("DELETING: ", max_arc, "\n") c_dict[u].remove(v) p_dict[v].remove(u) elif max_operation == "Reversal": if debug: print("REVERSING: ", max_arc, "\n") c_dict[u].remove(v) p_dict[v].remove(u) c_dict[v].append(u) p_dict[u].append(v) else: if debug: print("No Improvement on Iter: ", _iter) ### TEST FOR MAX ITERATION ### _iter += 1 if _iter > max_iter: if debug: print("Max Iteration Reached") break bn = BayesNet(c_dict) return bn