Source code for base.models

# -*- coding: utf-8 -*-

from pickle import HIGHEST_PROTOCOL as pickle_HIGHEST_PROTOCOL
import numpy as np

from django.db import models
from django.contrib.contenttypes.models import ContentType
from django.contrib.contenttypes.fields import (GenericForeignKey,
                                                GenericRelation, )
from django.utils.translation import ugettext_lazy as _
from django.core.exceptions import ValidationError

from picklefield.fields import PickledObjectField
from jsonfield import JSONField


class StatisticalModel(models.Model):
    """
    Abstract base model for Learning Techniques.

    It defines the common interface so the Techniques can be "plugged"
    along the framework and the applications. Concrete Techniques must
    implement ``get_engine_object()``, ``perform_inference()`` and
    ``get_results()``, which raise ``NotImplementedError`` here.
    """
    SM_TYPE_GENERAL = 0
    SM_TYPE_SUPERVISED = 1
    SM_TYPE_UNSUPERVISED = 2

    # NOTE(review): the labels of the supervised / unsupervised entries
    # ("Classification" / "Regression") do not match the constants' names.
    # They are user-facing values, so they are kept unchanged here;
    # confirm the intended wording before altering them.
    #: Choices for Statistical Model Type
    SM_TYPE_CHOICES = (
        (SM_TYPE_GENERAL, "General"),
        (SM_TYPE_SUPERVISED, "Classification"),
        (SM_TYPE_UNSUPERVISED, "Regression"),
    )
    #: Allowed Keywords for Threshold actions
    ACTIONS_KEYWORDS = [":recalculate", ]

    #: Unique Name, meant to be used for retrieving the object.
    name = models.CharField(
        "Name",
        unique=True,
        max_length=100
    )
    #: Type of the Statistical Model
    sm_type = models.SmallIntegerField(
        "Statistical Technique Type",
        choices=SM_TYPE_CHOICES, default=SM_TYPE_GENERAL,
        blank=True, null=True
    )
    #: If the System or Technique has results - i.e. Clustering
    has_results = models.BooleanField(
        "Has Results?",
        default=True
    )
    #: Field for storing metadata (results and information related to
    #: internal tasks) of the System or Technique
    metadata = JSONField(
        "Metadata",
        default={}, blank=True, null=True
    )
    #: This is where the main object of the Engine resides.
    engine_object = PickledObjectField(
        "Engine Object",
        protocol=pickle_HIGHEST_PROTOCOL,
        blank=True, null=True
    )
    #: The timestamp of the Engine Object creation or last update
    engine_object_timestamp = models.DateTimeField(
        "Engine Object Timestamp",
        blank=True, null=True
    )
    #: Number of times to run the Engine inference
    engine_meta_iterations = models.SmallIntegerField(
        "Engine Meta Iterations",
        default=1
    )
    #: Engine Maximum iterations safeguard
    engine_iterations = models.SmallIntegerField(
        "Engine Iterations (Max)",
        blank=True, null=True
    )
    #: Where to store the results (if applicable)
    results_storage = models.CharField(
        "Results Storage",
        max_length=100, blank=True, null=True
    )
    #: Automation: Internal Counter
    counter = models.IntegerField(
        "Internal Counter",
        default=0, blank=True, null=True
    )
    #: Automation: Internal Counter Threshold
    counter_threshold = models.IntegerField(
        "Internal Counter Threshold",
        blank=True, null=True
    )
    #: Automation: Actions to be run when the threshold is met.
    threshold_actions = models.CharField(
        "Threshold actions",
        max_length=200, blank=True, null=True
    )
    #: Fields, Attributes or Callables from where to retrieve the data
    #: for the System or Technique
    data_columns = GenericRelation(
        "base.DataColumn",
        related_query_name="%(app_label)s_%(class)ss",
    )
    #: If Inference has been performed on the System or Technique
    is_inferred = models.BooleanField(
        "Is Inferred?",
        default=False
    )

    class Meta:
        abstract = True
        verbose_name = "Statistical Technique"
        verbose_name_plural = "Statistical Techniques"

    def __str__(self):
        return "[ST|{0}]".format(self.name)

    # -> Public API
    def get_engine_object(self, reconstruct=False, save=True):
        """
        Returns the main object provided by the Statistical Engine.

        It is responsible for initializing the Engine object if not exists
        - or is indicated by the "reconstruct" kwarg - and save it to the
        "engine_object" field.

        Must be implemented by each Technique; the base implementation
        raises ``NotImplementedError``.
        """
        raise NotImplementedError("A Technique should implement this method")

    def reset_engine_object(self, save=True):
        """
        Resets the Engine-related fields (engine_object,
        engine_object_timestamp, metadata and is_inferred).

        If ``save`` is true the object is saved afterwards.
        Always returns ``True``.
        """
        self.engine_object = None
        self.engine_object_timestamp = None
        self.metadata = {}
        self.is_inferred = False
        if save:
            self.save()
        return True

    def perform_inference(self, recalculate=False, save=True):
        """
        Performs the Inference with the Statistical Engine and updates the
        Engine Object.

        Must be implemented by each Technique; the base implementation
        raises ``NotImplementedError``.
        """
        raise NotImplementedError("A Technique should implement this method")

    def reset_inference(self, save=True):
        """
        Base inference resetting (defaults to reset_engine_object()).
        """
        return self.reset_engine_object(save=save)

    def get_data(self):
        """
        Returns the data of the Technique's columns stacked along the last
        axis (``np.stack(..., axis=-1)``), one entry per data column in
        ascending "position" order.

        Raises ``ValueError`` if no columns are defined and
        ``ValidationError`` if the columns differ in length.
        """
        columns = self.data_columns.all().order_by("position")
        if not columns:
            raise ValueError(_("No columns defined for the Model / Technique"))

        # As they may not be from the same model, they can't be retrieved
        # straight from the ORM. A list (instead of a dict) is used so the
        # "position" ordering is kept regardless of the Python version, and
        # each queryset is materialized only once.
        column_values = [
            list(column.ref_model.model_class().objects.values_list(
                column.ref_column, flat=True))
            for column in columns
        ]

        # All the columns must have the same length to be stacked
        expected_length = len(column_values[0])
        if any(len(values) != expected_length
               for values in column_values[1:]):
            raise ValidationError(
                {"ref_column": _("Columns lengths does not match.")})

        return np.stack(column_values, axis=-1)

    def get_results(self):
        """
        Returns the results of the Technique.

        Must be implemented by each Technique; the base implementation
        raises ``NotImplementedError``.
        """
        raise NotImplementedError("A Technique should implement this method")

    def store_results(self, reset=False):
        """
        Stores the results of the inference of a System or Technique in a
        Model's field (to be generalized for other storage options).

        Note that it will update the results using the default ordering of
        the Model in which will be stored.

        Returns ``True`` if the storing was attempted (the Technique has
        results and a results storage defined), ``False`` otherwise.
        """
        if not (self.has_results and self.results_storage):
            return False
        self._store_results(reset=reset)
        return True

    def parse_and_run_threshold_actions(self):
        """
        Parses and runs the thresholds actions.

        When a counter threshold is set and the internal counter has
        reached it, the counter is reset to 0 and every ":recalculate"
        action triggers ``perform_inference(recalculate=True)``.

        Returns ``True`` if the threshold was met, ``False`` otherwise
        (no threshold set, no counter, or threshold not reached yet).
        """
        if not self.counter_threshold:
            return False
        # "counter" is nullable: without a counter the threshold can't be met
        if self.counter is None or self.counter_threshold > self.counter:
            return False

        self.counter = 0
        # "threshold_actions" is nullable: no actions means nothing to run
        for action in (self.threshold_actions or "").split(" "):
            if action == ":recalculate":
                self.perform_inference(recalculate=True)
        return True

    def rotate_metadata(self):
        """
        Rotates metadata from "current_inference" to "previous_inference"
        if it is not empty.

        Does nothing when the metadata is unset / empty or has no
        "current_inference" entry (i.e. before the first save() has
        initialized it).
        """
        metadata = self.metadata
        if not metadata:
            return
        current_inference = metadata.get("current_inference", {})
        if current_inference != {}:
            metadata["previous_inference"] = current_inference
            metadata["current_inference"] = {}

    # -> Django Models API
    def clean(self):
        """
        Validates the "results_storage" field (format, storage engine,
        target model and target field) and the "threshold_actions"
        keywords, raising ``ValidationError`` on the offending field.
        """
        if self.results_storage:
            # Check the validity of results_storage field
            try:
                rs = self._parse_results_storage()
            except Exception as e:
                # The message is formatted *after* translation so the
                # catalog lookup is done with the untouched msgid.
                raise ValidationError({'results_storage': _(
                    'Invalid format or storage engine: {}'
                ).format(self._exception_message(e))})
            if rs["storage"] == "dmf":
                try:
                    model_class = ContentType.objects.get(
                        app_label=rs["attrs"]["app"],
                        model=rs["attrs"]["model"].lower()
                    ).model_class()
                except Exception as e:
                    raise ValidationError({'results_storage': _(
                        'Error getting the model: {}'
                    ).format(self._exception_message(e))})
                try:
                    getattr(model_class, rs["attrs"]["field"])
                except Exception as e:
                    raise ValidationError({'results_storage': _(
                        'Error accessing the field: {}'
                    ).format(self._exception_message(e))})
        # Check threshold_actions keywords are valid
        if self.threshold_actions:
            for action in self.threshold_actions.split(" "):
                if action not in self.ACTIONS_KEYWORDS:
                    raise ValidationError({'threshold_actions': _(
                        'Unrecognized action: {}'
                    ).format(action)})

    def save(self, *args, **kwargs):
        """
        Base save() processing: initializes the metadata entries when the
        metadata is empty and runs the threshold actions before saving.
        """
        # Initialize metadata field if corresponds
        if self.metadata == {}:
            self.metadata["current_inference"] = {}
            self.metadata["previous_inference"] = {}
        # Runs threshold actions if corresponds
        self.parse_and_run_threshold_actions()

        super(StatisticalModel, self).save(*args, **kwargs)

    # -> Internal API
    @staticmethod
    def _exception_message(exception):
        """
        Returns the first argument of the exception or, if it has none,
        the exception itself (avoids an IndexError on ``args[0]``).
        """
        return exception.args[0] if exception.args else exception

    def _parse_results_storage(self):
        """
        Parses "results_storage" ("<storage>:<attrs>") into a dict with
        the "storage" and "attrs" keys. Only the "dmf" storage is
        implemented, with "<app>.<model>.<field>" as attrs.

        Raises ``ValueError`` on a malformed value or an unknown storage.
        """
        storage, attrs = self.results_storage.split(":", 1)
        if storage != "dmf":
            raise ValueError(_(
                '"{}" engine is not implemented.'
            ).format(storage))

        app, model, field = attrs.split(".")
        return {
            "storage": storage,
            "attrs": {
                "app": app,
                "model": model,
                "field": field
            }
        }

    def _store_results(self, reset=False):
        """
        Writes the results in the field of the Model referenced by
        "results_storage" - or sets that field to ``None`` in all the
        objects if ``reset`` is true.
        """
        # results_storage already validated
        rs = self._parse_results_storage()
        if rs["storage"] != "dmf":
            return

        field = rs["attrs"]["field"]
        model_class = ContentType.objects.get(
            app_label=rs["attrs"]["app"],
            model=rs["attrs"]["model"].lower()
        ).model_class()

        if reset:
            # The results are not needed for resetting the field
            model_class.objects.all().update(**{field: None})
            return

        results = self.get_results()
        # Prevent from new records
        model_objects = model_class.objects.all()[:len(results)]
        # This could be done with django-bulk-update
        # but for not adding another dependency:
        for index, model_object in enumerate(model_objects):
            setattr(model_object, field, results[index])
            model_object.save()
class SupervisedLearningTechnique(StatisticalModel):
    """
    Abstract base model for Supervised Learning Techniques.

    Concrete Techniques must implement ``predict()``,
    ``get_pretraining_data()``, ``get_pretraining_labels()`` and
    ``perform_cross_validation()``, which raise ``NotImplementedError``
    here.
    """
    SL_TYPE_CLASSIFICATION = 0
    SL_TYPE_REGRESSION = 1

    #: Choices for Supervised Learning Type
    SL_TYPE_CHOICES = (
        (SL_TYPE_CLASSIFICATION, "Classification"),
        (SL_TYPE_REGRESSION, "Regression"),
    )

    #: Supervised Learning Type
    sl_type = models.SmallIntegerField(
        "Supervised Learning Type",
        choices=SL_TYPE_CHOICES, default=SL_TYPE_CLASSIFICATION,
        blank=True, null=True
    )
    #: Field or Attribute containing the labels of the data
    labels_column = models.CharField(
        "Labels' Column",
        max_length=100,
        blank=True, null=True,
        help_text=_((
            'Format: app_label.model.attribute'
        ))
    )
    # -> Pre-training
    #: Django Model containing the pre-training dataset in the
    #: "app_label.model" format, i.e. "examples.SFPTEnron"
    pretraining = models.CharField(
        "Pre-Training dataset",
        max_length=100,
        blank=True, null=True,
        # The trailing space keeps the two literals from being glued
        # together ('...in the"app_label.model"...').
        help_text=(
            'Django Model containing the pre-training dataset in the '
            '"app_label.model" format, i.e. "examples.SFPTEnron"'
        )
    )
    # -> Cross Validation
    #: Enable Cross Validation (k-Folded)
    cv_is_enabled = models.BooleanField(
        "Cross Validation is Enabled?",
        default=True,
        help_text=(
            'Enable Cross Validation'
        )
    )
    #: Quantity of Folds to be used in Cross Validation
    cv_folds = models.SmallIntegerField(
        "Cross Validation Folds",
        blank=True, null=True,
        help_text=(
            'Quantity of Folds to be used in Cross Validation'
        )
    )
    #: Metric to be evaluated in Cross Validation
    cv_metric = models.CharField(
        "Cross Validation Metric",
        max_length=20,
        blank=True, null=True,
        help_text=(
            'Metric to be evaluated in Cross Validation'
        )
    )

    class Meta:
        abstract = True
        verbose_name = "Supervised Learning Technique"
        verbose_name_plural = "Supervised Learning Techniques"
        app_label = "supervised_learning"

    def __init__(self, *args, **kwargs):
        # The Statistical Model type is always forced to "supervised",
        # overriding any "sm_type" given as keyword argument.
        kwargs["sm_type"] = self.SM_TYPE_SUPERVISED
        super(SupervisedLearningTechnique, self).__init__(*args, **kwargs)

    def __str__(self):
        return "[SL|{0}]".format(self.name)

    # -> Public API
    def predict(self, sl_input):
        """
        Must be implemented by each Technique; the base implementation
        raises ``NotImplementedError``.
        """
        raise NotImplementedError("A Technique should implement this method")

    def get_labels(self):
        """
        Returns the labels of the data available for the model: the flat
        values list of the attribute referenced by "labels_column"
        ("app_label.model.attribute"), or ``None`` if it is not set.
        """
        if not self.labels_column:
            return None
        app, model, attribute = self.labels_column.split(".")
        model_class = ContentType.objects.get(
            app_label=app,
            model=model.lower()
        ).model_class()
        return model_class.objects.values_list(attribute, flat=True)

    def get_pretraining_data(self):
        """
        Returns the pre-training data.

        Must be implemented by each Technique; the base implementation
        raises ``NotImplementedError``.
        """
        raise NotImplementedError("A Technique should implement this method")

    def get_pretraining_labels(self):
        """
        Returns the pre-training labels.

        Must be implemented by each Technique; the base implementation
        raises ``NotImplementedError``.
        """
        raise NotImplementedError("A Technique should implement this method")

    def perform_cross_validation(self, data=None, labels=None,
                                 update_metadata=False):
        """
        Performs Cross Validation with the current state of the model on
        the available data or in a given set.

        Must be implemented by each Technique; the base implementation
        raises ``NotImplementedError``.
        """
        raise NotImplementedError("A Technique should implement this method")

    # -> Django Models API
    def clean(self):
        """
        Runs the parent's validation and then checks that "labels_column"
        (if set) has the "app_label.model.attribute" format and references
        an existing attribute of an existing Django Model.

        Raises ``ValidationError`` on the offending field.
        """
        # Without this, the parent's checks (results_storage and
        # threshold_actions) are never run for Supervised Techniques.
        super(SupervisedLearningTechnique, self).clean()

        if not self.labels_column:
            return

        # Check the validity of the Labels Column
        try:
            app, model, attribute = self.labels_column.split(".")
        except Exception:
            raise ValidationError({'labels_column': _(
                'Invalid format'
            )})

        try:
            model_class = ContentType.objects.get(
                app_label=app,
                model=model.lower()
            ).model_class()
        except Exception:
            model_class = None
        # model_class() returns None for a ContentType whose model is gone,
        # which must be reported here instead of failing later on
        # "model_class._meta".
        if model_class is None:
            raise ValidationError({'labels_column': _(
                'The Reference Model must be a valid Django Model'
            )})

        try:
            getattr(model_class, attribute)
        except Exception:
            # The message is formatted *after* translation so the catalog
            # lookup is done with the untouched msgid.
            raise ValidationError({'labels_column': _(
                'The column must be a valid attribute of '
                'the {} model'
            ).format(model_class._meta.verbose_name)})
class UnsupervisedLearningTechnique(StatisticalModel):
    """
    Abstract base model for Unsupervised Learning Techniques.

    Concrete Techniques must implement ``assign()``, which raises
    ``NotImplementedError`` here.
    """
    UL_TYPE_CLUSTERING = 0
    UL_TYPE_OTHER = 1

    #: Choices for Unsupervised Learning Type
    UL_TYPE_CHOICES = (
        (UL_TYPE_CLUSTERING, "Clustering"),
        (UL_TYPE_OTHER, "Other"),
    )

    #: Unsupervised Learning Type
    ul_type = models.SmallIntegerField(
        "Unsupervised Learning Type",
        choices=UL_TYPE_CHOICES,
        default=UL_TYPE_CLUSTERING,
        blank=True,
        null=True
    )

    class Meta:
        abstract = True
        verbose_name = "Unsupervised Learning Technique"
        verbose_name_plural = "Unsupervised Learning Techniques"

    def __init__(self, *args, **kwargs):
        # The Statistical Model type is always forced to "unsupervised",
        # overriding any "sm_type" given as keyword argument.
        kwargs.update(sm_type=self.SM_TYPE_UNSUPERVISED)
        super(UnsupervisedLearningTechnique, self).__init__(*args, **kwargs)

    def __str__(self):
        label = "[UL|{0}]".format(self.name)
        return label

    # -> Public API
    def assign(self, sl_input):
        """
        Must be implemented by each Technique; the base implementation
        raises ``NotImplementedError``.
        """
        raise NotImplementedError("A Technique should implement this method")
class DataColumn(models.Model):
    """
    A dimension / axis / column of Technique / Model.

    It links an object (the Technique / Model, through a generic foreign
    key) with a column of a Django Model (``ref_model`` / ``ref_column``)
    at an optional ``position``.
    """
    # -> Model or Technique reference
    content_type = models.ForeignKey(
        ContentType,
        on_delete=models.CASCADE
    )
    object_id = models.PositiveIntegerField()
    content_object = GenericForeignKey(
        'content_type',
        'object_id'
    )
    # -> Data reference
    ref_model = models.ForeignKey(
        ContentType,
        on_delete=models.CASCADE,
        related_name="%(app_label)s_%(class)ss"
    )
    ref_column = models.CharField(
        "Reference Column",
        max_length=100
    )
    position = models.SmallIntegerField(
        "Position",
        blank=True, null=True
    )

    class Meta:
        verbose_name = "Data Column"
        verbose_name_plural = "Data Columns"
        unique_together = [("content_type", "object_id",
                            "ref_model", "ref_column"),
                           ("content_type", "object_id", "position")]
        app_label = "base"

    def __str__(self):
        return "{0} | {1} - {2}".format(
            self.content_type, self.ref_model, self.ref_column)

    def clean(self):
        """
        Checks that the Reference Model resolves to a Django Model class
        and that the Reference Column is an attribute of it.

        Raises ``ValidationError`` on "ref_model" or "ref_column".
        """
        # Check the validity of the Reference Model
        try:
            mc = self.ref_model.model_class()
        except Exception:
            mc = None
        # model_class() returns None for a ContentType whose model is gone:
        # that is a problem of the model, not of the column.
        if mc is None:
            raise ValidationError({'ref_model': _(
                'The Reference Model must be a valid Django Model'
            )})

        # Check the validity of the Reference Column
        try:
            getattr(mc, self.ref_column)
        except Exception:
            # The message is formatted *after* translation so the catalog
            # lookup is done with a constant msgid.
            raise ValidationError({'ref_column': _(
                'The column must be a valid attribute of '
                'the {} model'
            ).format(self.ref_model.name)})