from typing import List, Optional, Dict
import pandas as pd
import numpy as np
from lightwood.mixer.base import BaseMixer
from lightwood.ensemble.base import BaseEnsemble
from lightwood.api.types import PredictionArguments
from lightwood.data.encoded_ds import EncodedDs
from lightwood import dtype
from lightwood.helpers.general import evaluate_accuracy
from lightwood.helpers.numeric import is_nan_numeric
from lightwood.helpers.log import log
class ModeEnsemble(BaseEnsemble):
    """
    Majority-vote ensemble for classification targets.

    For every row, each mixer casts one vote (its prediction). The label with the most votes wins; when several
    labels are tied for the most votes, the tie is broken in favour of the label whose voting mixers have the
    highest summed accuracy score, as measured once in ``__init__``.
    """
    # Maps each mixer's prediction-column key (see `_mixer_keys`) to its mean accuracy score.
    mixer_scores: Dict[str, float]

    def __init__(self, target, mixers: List[BaseMixer], data: EncodedDs, dtype_dict: dict,
                 accuracy_functions, args: PredictionArguments, ts_analysis: Optional[dict] = None) -> None:
        """
        Score every mixer so that ties between equally frequent predictions can later be resolved.

        :param target: Name of the target column; used to look up its type in `dtype_dict` and passed to `evaluate_accuracy`.
        :param mixers: Mixers whose predictions will be combined.
        :param data: Dataset each mixer predicts on to obtain its score; its `data_frame` is used as the ground truth source.
        :param dtype_dict: Mapping from column name to data type; only the entry for `target` is read here.
        :param accuracy_functions: Forwarded unchanged to `evaluate_accuracy`.
        :param args: Prediction arguments forwarded to each mixer call.
        :param ts_analysis: Optional time-series analysis, forwarded unchanged to `evaluate_accuracy`.

        :raises Exception: If the target's dtype is not binary, categorical or tags.
        """  # noqa
        super().__init__(target, mixers, data)
        self.mixer_scores = {}

        if dtype_dict[target] not in (dtype.binary, dtype.categorical, dtype.tags):
            raise Exception(
                'This ensemble can only be used in classification problems! ' +
                f'Got target dtype {dtype_dict[target]} instead!')

        for mixer_key, mixer in zip(self._mixer_keys(mixers), mixers):
            score_dict = evaluate_accuracy(
                data.data_frame,
                mixer(data, args)['prediction'],
                target,
                accuracy_functions,
                ts_analysis=ts_analysis
            )
            # A mixer's score is the plain average over all accuracy functions.
            avg_score = np.mean(list(score_dict.values()))
            log.info(f'Mixer: {type(mixer).__name__} got accuracy: {avg_score}')

            if is_nan_numeric(avg_score):
                # NaN cannot be summed or compared meaningfully in the tie-break, so replace it
                # with a very low but valid number.
                avg_score = -pow(2, 63)
                log.warning(f'Change the accuracy of mixer {type(mixer).__name__} to valid value: {avg_score}')

            self.mixer_scores[mixer_key] = avg_score

    @staticmethod
    def _mixer_keys(mixers) -> List[str]:
        """
        Build one unique key per mixer, in order, used both as `mixer_scores` key and as prediction column name.

        The first mixer of a given class gets ``__mdb_mixer_<ClassName>``; any further mixer of the same class gets
        the same key with ``_<n>`` appended (n = 1, 2, ...). Without the suffix, two mixers of the same class would
        overwrite each other's score and prediction column, silently dropping a vote.

        :param mixers: Iterable of mixers.
        :return: List of keys, aligned with `mixers`.
        """
        occurrences: Dict[str, int] = {}
        keys = []
        for mixer in mixers:
            base_key = f'__mdb_mixer_{type(mixer).__name__}'
            seen = occurrences.get(base_key, 0)
            occurrences[base_key] = seen + 1
            keys.append(base_key if seen == 0 else f'{base_key}_{seen}')
        return keys

    def _pick_mode_highest_score(self, prediction: pd.Series):
        """
        Pick the ensemble prediction for a single row.

        If the predictions are unimodal, return the mode. If there are multiple modes, return the mode whose voting
        mixers have the highest summed score.

        :param prediction: One row of the predictions frame: the index holds the mixer keys, the values hold each mixer's predicted label.
        :return: The winning label, or `np.nan` when the row contains no countable (non-missing) prediction.
        """  # noqa
        prediction_counts = prediction.value_counts()

        # value_counts() leaves out missing values, so a row in which no mixer produced a label has
        # nothing to vote on. np.max() would raise on the empty array below; report a missing value instead.
        if len(prediction_counts) == 0:
            return np.nan

        # If there is a clear winner, i.e. only one distinct prediction
        if len(prediction_counts) == 1:
            return prediction_counts.index[0]

        # How many times the most frequent prediction(s) appeared
        max_count = np.max(prediction_counts.values)
        # All predictions that reached that count
        modes_predictions = prediction_counts[prediction_counts == max_count].index

        # For each mode, get the sum of the scores of the mixers who voted for it
        modes_predictions_scores = {
            mode_prediction: np.sum([
                self.mixer_scores[mixer_name]
                for mixer_name in prediction[prediction == mode_prediction].index.tolist()
            ])
            for mode_prediction in modes_predictions
        }

        # Return the mode with the maximum sum of accuracies
        return max(modes_predictions_scores, key=modes_predictions_scores.get)

    def __call__(self, ds: EncodedDs, args: PredictionArguments) -> pd.DataFrame:
        """
        Predict with every mixer and combine the results row by row through `_pick_mode_highest_score`.

        :param ds: Dataset to predict on; passed to each mixer.
        :param args: Prediction arguments; passed to each mixer.
        :return: Dataframe with a single `prediction` column holding the ensembled labels.
        """
        predictions_df = pd.DataFrame()
        # One column per mixer; column names must match the keys stored in `mixer_scores`.
        for mixer_key, mixer in zip(self._mixer_keys(self.mixers), self.mixers):
            predictions_df[mixer_key] = mixer(ds, args=args)['prediction']

        mode_df = predictions_df.apply(func=self._pick_mode_highest_score, axis='columns')

        return pd.DataFrame(mode_df, columns=['prediction'])