Source code for examples.migrations.0015_sfptenron_sfptyoutube

# -*- coding: utf-8 -*-
# Generated by Django 1.11.5 on 2017-12-13 08:02
from __future__ import unicode_literals
import os
import tarfile
import urllib.request
import random
import zipfile
import csv
import io

from django.db import migrations, models


# Directory containing this module; the dataset archives are cached next to it.
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))

# Enron mails dataset: remote archive and the local path it is saved to.
ENRON_MAILS_FILE_URL = (
    "http://www.aueb.gr/users/ion/data/enron-spam/preprocessed/"
    "enron1.tar.gz"
)
ENRON_MAILS_FILE_NAME = os.path.join(CURRENT_DIR, "enron1.tar.gz")

# Youtube comments dataset: remote archive and the local path it is saved to.
YOUTUBE_COMMENTS_FILE_URL = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00380/"
    "YouTube-Spam-Collection-v1.zip"
)
YOUTUBE_COMMENTS_FILE_NAME = os.path.join(
    CURRENT_DIR, "YouTube-Spam-Collection-v1.zip"
)


def confirm(question):
    """
    Ask a yes/no question on standard input until a valid answer is given.

    Adapted from https://gist.github.com/garrettdreyfus/8153571

    The prompt is displayed as `` -> <question> (Y/n): ``. The answer is
    lower-cased and stripped of surrounding whitespace; ``y`` or an empty
    answer (just pressing Enter) means yes, ``n`` means no. Any other answer
    re-prompts using "Mmmm... Please enter" as the question.

    If standard input is exhausted (``EOFError``, e.g. a non-interactive
    run), the default answer advertised by the prompt -- yes -- is returned
    instead of aborting the caller.

    :param question: Text of the question to display.
    :return: ``True`` for yes, ``False`` for no.
    """
    prompt = question
    # Loop rather than recurse, so that repeated invalid answers cannot grow
    # the call stack.
    while True:
        try:
            reply = input(' -> ' + prompt + ' (Y/n): ').lower().strip()
        except EOFError:
            # Nothing left to read: behave as if Enter was pressed.
            return True
        if reply in ('y', ''):
            return True
        if reply == 'n':
            return False
        prompt = "Mmmm... Please enter"


def download_and_process_pretrain_data_files(apps, schema_editor):
    """
    Forward Operation: Downloads if necessary the sample data and populates
    Pre-Train Models.

    When either dataset archive is missing next to this module, the user is
    asked whether to download them. Every archive that is present afterwards
    is loaded into its model (``SFPTEnron`` / ``SFPTYoutube``); an archive
    that is still missing (download declined) is skipped with a notice
    instead of failing the migration.

    :param apps: App registry used to obtain the historical models.
    :param schema_editor: Unused; part of the ``RunPython`` callable
        signature.
    """
    SFPTEnron = apps.get_model("examples", "SFPTEnron")
    SFPTYoutube = apps.get_model("examples", "SFPTYoutube")
    # NOTE(review): nothing in this function draws from ``random``; the seed
    # is kept so the global RNG state after the migration is unchanged.
    random.seed(1234567)

    # -> Download datasets if not exist
    enron_missing = not os.path.exists(ENRON_MAILS_FILE_NAME)
    youtube_missing = not os.path.exists(YOUTUBE_COMMENTS_FILE_NAME)
    if ((enron_missing or youtube_missing) and
            confirm("Proceed to download pre-training datasets?")):
        if enron_missing:
            print(" Downloading Enron mails dataset...")
            urllib.request.urlretrieve(
                ENRON_MAILS_FILE_URL, ENRON_MAILS_FILE_NAME
            )
        if youtube_missing:
            print(" Downloading Youtube comments dataset...")
            urllib.request.urlretrieve(
                YOUTUBE_COMMENTS_FILE_URL, YOUTUBE_COMMENTS_FILE_NAME
            )

    # -> Process the Enron mails file
    if os.path.exists(ENRON_MAILS_FILE_NAME):
        with tarfile.open(name=ENRON_MAILS_FILE_NAME, mode="r:gz") as tfile:
            for member in tfile.getmembers():
                # Only regular files are messages; "Summary" entries are not.
                if not member.isfile() or "Summary" in member.name:
                    continue
                with tfile.extractfile(member) as mail_file:
                    message = mail_file.read()
                SFPTEnron.objects.create(
                    content=message.decode('raw_unicode_escape'),
                    # The label is taken from the member's path/name.
                    is_spam=("spam" in member.name),
                )
    else:
        print(" Enron mails dataset not available, skipping it.")

    # -> Process Youtube comments file
    if os.path.exists(YOUTUBE_COMMENTS_FILE_NAME):
        with zipfile.ZipFile(YOUTUBE_COMMENTS_FILE_NAME) as zfile:
            for yt_file in zfile.infolist():
                # Skip the "MACOSX" entries stored in the archive.
                if "MACOSX" in yt_file.filename:
                    continue
                with zfile.open(yt_file.filename) as csvfile:
                    csvfile_sio = io.StringIO(
                        csvfile.read().decode('raw_unicode_escape')
                    )
                for row in csv.DictReader(csvfile_sio):
                    SFPTYoutube.objects.create(
                        content=row['CONTENT'],
                        is_spam=(row['CLASS'] == '1')
                    )
    else:
        print(" Youtube comments dataset not available, skipping it.")
def confirm_deletion_pretrain_data_files(apps, schema_editor):
    """
    Backward Operation: Remove the file if deemed necessary, no need to
    remove the objects as the table will be removed.

    The user is asked whether to keep the downloaded dataset archives. If
    not, both archives are deleted; an archive that is already absent is
    ignored so that it neither aborts the rollback nor prevents the other
    archive from being removed.

    :param apps: Unused; part of the ``RunPython`` callable signature.
    :param schema_editor: Unused; part of the ``RunPython`` callable
        signature.
    """
    if confirm("Leave downloaded pre-train datasets files for the future?"):
        return
    for dataset_file in (ENRON_MAILS_FILE_NAME, YOUTUBE_COMMENTS_FILE_NAME):
        try:
            os.remove(dataset_file)
        except FileNotFoundError:
            # Already gone: nothing left to clean up for this archive.
            pass


class Migration(migrations.Migration):

    dependencies = [
        ('examples', '0014_missing_ui_meta'),
    ]

    operations = [
        # Table holding the Enron mails used for pre-training.
        migrations.CreateModel(
            name='SFPTEnron',
            fields=[
                ('id', models.AutoField(
                    auto_created=True, primary_key=True,
                    serialize=False, verbose_name='ID')),
                ('content', models.TextField(verbose_name='Content')),
                ('is_spam', models.BooleanField(
                    default=False, verbose_name='Is Spam?')),
            ],
            options={
                'verbose_name': 'Spam Filter Pre-training: Enron Email Data',
                'verbose_name_plural': ('Spam Filter Pre-trainings: '
                                        'Enron Emails Data'),
            },
        ),
        # Table holding the Youtube comments used for pre-training.
        migrations.CreateModel(
            name='SFPTYoutube',
            fields=[
                ('id', models.AutoField(
                    auto_created=True, primary_key=True,
                    serialize=False, verbose_name='ID')),
                ('content', models.TextField(verbose_name='Content')),
                ('is_spam', models.BooleanField(
                    default=False, verbose_name='Is Spam?')),
            ],
            options={
                'verbose_name': ('Spam Filter Pre-training: '
                                 'Youtube Comment Data'),
                'verbose_name_plural': ('Spam Filter Pre-trainings: '
                                        'Youtube Comments Data'),
            },
        ),
        # Forward: fetch and load the datasets. Backward: optionally delete
        # the downloaded archives.
        migrations.RunPython(
            download_and_process_pretrain_data_files,
            confirm_deletion_pretrain_data_files
        ),
    ]