# Source code for examples.migrations.0015_sfptenron_sfptyoutube

# -*- coding: utf-8 -*-
# Generated by Django 1.11.5 on 2017-12-13 08:02
from __future__ import unicode_literals
import os
import tarfile
import urllib.request
import random
import zipfile
import csv
import io

from django.db import migrations, models


# Directory containing this migration module; the dataset archives are stored
# right next to it.
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))

# Enron mails archive: local path and the URL it is fetched from.
ENRON_MAILS_FILE_NAME = os.path.join(CURRENT_DIR, "enron1.tar.gz")
ENRON_MAILS_FILE_URL = (
    "http://www.aueb.gr/users/ion/data/enron-spam/preprocessed/enron1.tar.gz"
)

# Youtube comments archive: local path and the URL it is fetched from.
YOUTUBE_COMMENTS_FILE_NAME = os.path.join(
    CURRENT_DIR, "YouTube-Spam-Collection-v1.zip"
)
YOUTUBE_COMMENTS_FILE_URL = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "00380/YouTube-Spam-Collection-v1.zip"
)


def confirm(question):
    """
    Ask a yes/no question on the console and return the answer as a boolean.

    Based on https://gist.github.com/garrettdreyfus/8153571

    The reply is lower-cased and stripped. ``y`` or an empty reply (the
    advertised default) yields ``True`` and ``n`` yields ``False``. Any other
    reply triggers a new prompt until a valid answer is given.

    When standard input is exhausted or closed (e.g. a non-interactive run)
    the default answer ``True`` is returned instead of letting ``EOFError``
    propagate.

    :param question: Text shown to the user on the first prompt.
    :returns: ``True`` for yes, ``False`` for no.
    """
    prompt = question
    # Loop instead of recursing so that any number of invalid replies cannot
    # exhaust the interpreter stack.
    while True:
        try:
            reply = str(input(' -> ' + prompt + ' (Y/n): ')).lower().strip()
        except EOFError:
            # Nobody can answer: behave as if the default had been accepted.
            return True
        if reply in ('y', ''):
            return True
        if reply == 'n':
            return False
        prompt = "Mmmm... Please enter"


def _fetch_dataset(url, destination):
    """
    Download ``url`` to ``destination`` through a temporary ``.part`` file so
    that an interrupted transfer never leaves a truncated archive under the
    final name (which would later be mistaken for a complete download).
    """
    partial = destination + ".part"
    try:
        urllib.request.urlretrieve(url, partial)
        os.replace(partial, destination)
    finally:
        # Still present only when the download or the rename failed.
        if os.path.exists(partial):
            os.remove(partial)


def download_and_process_pretrain_data_files(apps, schema_editor):
    """
    Forward Operation: Downloads if necessary the sample data and populates
    Pre-Train Models.

    If at least one archive is missing the user is asked for confirmation
    before anything is downloaded. An archive that is still missing afterwards
    (the download was declined) is skipped with a notice, leaving its table
    empty, instead of failing the migration.

    :param apps: Historical app registry provided by ``RunPython``.
    :param schema_editor: Schema editor provided by ``RunPython`` (unused).
    """
    SFPTEnron = apps.get_model("examples", "SFPTEnron")
    SFPTYoutube = apps.get_model("examples", "SFPTYoutube")
    # Kept from the original implementation; nothing in this function draws
    # random numbers, but the seeding of the global generator is preserved.
    random.seed(1234567)

    # -> Download datasets if not exist
    enron_missing = not os.path.exists(ENRON_MAILS_FILE_NAME)
    youtube_missing = not os.path.exists(YOUTUBE_COMMENTS_FILE_NAME)
    if ((enron_missing or youtube_missing) and
            confirm("Proceed to download pre-training datasets?")):
        if enron_missing:
            print("    Downloading Enron mails dataset...")
            _fetch_dataset(ENRON_MAILS_FILE_URL, ENRON_MAILS_FILE_NAME)
        if youtube_missing:
            print("     Downloading Youtube comments dataset...")
            _fetch_dataset(YOUTUBE_COMMENTS_FILE_URL,
                           YOUTUBE_COMMENTS_FILE_NAME)

    # NOTE(review): 'raw_unicode_escape' maps each byte to the code point of
    # the same value and additionally interprets literal backslash-u escapes;
    # it is kept unchanged so the stored text matches earlier runs.

    # -> Process the Enron mails file
    if os.path.exists(ENRON_MAILS_FILE_NAME):
        mails = []
        with tarfile.open(name=ENRON_MAILS_FILE_NAME, mode="r:gz") as tfile:
            for member in tfile.getmembers():
                if not member.isfile() or "Summary" in member.name:
                    continue
                message = tfile.extractfile(member).read()
                mails.append(SFPTEnron(
                    content=message.decode('raw_unicode_escape'),
                    # Spam messages are recognised by their path in the tar.
                    is_spam=("spam" in member.name),
                ))
        # One batched INSERT per 500 rows instead of one query per message.
        SFPTEnron.objects.bulk_create(mails, batch_size=500)
    else:
        print("    Enron mails dataset not found, skipping.")

    # -> Process Youtube comments file
    if os.path.exists(YOUTUBE_COMMENTS_FILE_NAME):
        comments = []
        with zipfile.ZipFile(YOUTUBE_COMMENTS_FILE_NAME) as zfile:
            for info in zfile.infolist():
                # Ignore the macOS resource-fork entries bundled in the zip.
                if "MACOSX" in info.filename:
                    continue
                with zfile.open(info.filename) as csvfile:
                    text = csvfile.read().decode('raw_unicode_escape')
                for row in csv.DictReader(io.StringIO(text)):
                    comments.append(SFPTYoutube(
                        content=row['CONTENT'],
                        is_spam=(row['CLASS'] == '1'),
                    ))
        SFPTYoutube.objects.bulk_create(comments, batch_size=500)
    else:
        print("    Youtube comments dataset not found, skipping.")


def confirm_deletion_pretrain_data_files(apps, schema_editor):
    """
    Backward Operation: Remove the file if deemed necessary, no need to remove
    the objects as the table will be removed.

    The user is asked whether the downloaded archives should be kept. When
    the answer is no, each archive is deleted independently; an archive that
    does not exist (never downloaded or already removed) is silently ignored
    so the rollback cannot fail on it.

    :param apps: Historical app registry provided by ``RunPython`` (unused).
    :param schema_editor: Schema editor provided by ``RunPython`` (unused).
    """
    if confirm("Leave downloaded pre-train datasets files for the future?"):
        return
    for path in (ENRON_MAILS_FILE_NAME, YOUTUBE_COMMENTS_FILE_NAME):
        try:
            os.remove(path)
        except FileNotFoundError:
            # Nothing to clean up for this archive.
            pass


class Migration(migrations.Migration):
    """
    Creates the ``SFPTEnron`` and ``SFPTYoutube`` models and then runs the
    data step that loads the pre-training datasets into them.
    """

    dependencies = [('examples', '0014_missing_ui_meta')]

    operations = [
        # Model for the Enron e-mail samples.
        migrations.CreateModel(
            name='SFPTEnron',
            fields=[
                (
                    'id',
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name='ID',
                    ),
                ),
                (
                    'content',
                    models.TextField(verbose_name='Content'),
                ),
                (
                    'is_spam',
                    models.BooleanField(
                        default=False,
                        verbose_name='Is Spam?',
                    ),
                ),
            ],
            options={
                'verbose_name': 'Spam Filter Pre-training: Enron Email Data',
                'verbose_name_plural': (
                    'Spam Filter Pre-trainings: '
                    'Enron Emails Data'
                ),
            },
        ),
        # Model for the Youtube comment samples.
        migrations.CreateModel(
            name='SFPTYoutube',
            fields=[
                (
                    'id',
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name='ID',
                    ),
                ),
                (
                    'content',
                    models.TextField(verbose_name='Content'),
                ),
                (
                    'is_spam',
                    models.BooleanField(
                        default=False,
                        verbose_name='Is Spam?',
                    ),
                ),
            ],
            options={
                'verbose_name': (
                    'Spam Filter Pre-training: '
                    'Youtube Comment Data'
                ),
                'verbose_name_plural': (
                    'Spam Filter Pre-trainings: '
                    'Youtube Comments Data'
                ),
            },
        ),
        # Forward: load the datasets; backward: optionally delete the archives.
        migrations.RunPython(
            download_and_process_pretrain_data_files,
            confirm_deletion_pretrain_data_files,
        ),
    ]