Source code for data.preprocessing
'''
Starting with the Wikimedia SQL database schema, this module creates a set of tables that will be used to aggregate the cohort trends.
'''
import os
import logging
logger = logging.getLogger('Preprocessing')
import settings
from data.tables import *
from data.userlists import *
from db import sql
[docs]def dropTable(tablename):
"""Drops a SQL table in the user database
:arg tablename: str, name of the table
"""
try:
logger.info('Dropping %s'%tablename)
cur = sql.getCursor()
cur.execute("DROP TABLE IF EXISTS %s;"%tablename)
except:
pass
[docs]def tableExists(tablename):
"""Returns True if the table exists in the user database
:arg tablename: str, name of the table
"""
cur = sql.getCursor()
cur.execute("show tables from %s like '%s';"%(settings.sqluserdb, tablename.split('.')[1]))
if cur.fetchone() is None:
return False
else:
return True
[docs]def createTable(query,tablename):
"""Create a SQL table in the user database
:arg tablename: str, name of the table
:arg query: str, query to execute
"""
try:
if settings.sqldroptables:
dropTable(tablename)
else:
# logger.info('Table %s not dropped.'tablename)
pass
if not tableExists(tablename):
cur = sql.getCursor()
logger.info('Creating %s table'%tablename)
cur.execute(query)
logger.info('Finished creating %s table'%tablename)
else:
logger.info('Table %s exists already! Do nothing'%tablename)
except:
logger.exception("Could not create table %s"%tablename)
[docs]def createIndex(query,tablename):
"""Create an index on a SQL table in the user database
:arg tablename: str, name of the table
:arg query: str, query to execute
"""
try:
cur = sql.getCursor()
cur.execute(query)
logger.info("Created indexes on %s"%tablename)
except:
logger.warning("Could not create index on %s. Possibly it already exists"%tablename)
[docs]def executeCommand(command,comment):
"""Exports a SQL table into a file
:arg command: str, the command used to export the
:arg comment: str, comment for logging stream
"""
try:
logger.info(comment)
os.system(command)
except:
logger.info('Failed executing command: %s'%command)
[docs]def process():
"""Creates the auxiliary SQL tables on the user database.
.. warning:
This can take a long time. Especially for larger Wikipedias. For the English Wikipedia, it will take over a week :(
"""
if settings.language in ['en','de']:
logger.warning('YOU ARE ATTEMPTING TO RUN THE PREPROCESSING ON THE ENGLISH OR GERMAN WIKIPEDIA. HOPEFULLY YOU ARE PATIENT, THIS WILL TAKE A WHILE!')
else:
logger.warning('Be patient, this can take a long time. I hope you used the screen command...')
logger.info('Preprocessing data for %swiki'%settings.language)
# Create the user database if it doesn't exist
logger.info('Creating Database %s'%settings.sqluserdb)
cur = sql.getCursor()
cur.execute(CREATE_USER_DATABASE)
# CREATE TABLES AND INDEXES
createTable(CREATE_USER_COHORTS,USER_COHORT)
createIndex(INDEX_USER_COHORTS,USER_COHORT)
createTable(CREATE_REV_LEN_CHANGED,REV_LEN_CHANGED)
createIndex(INDEX_REV_LEN_CHANGED,REV_LEN_CHANGED)
createTable(CREATE_EDITOR_YEAR_MONTH,EDITOR_YEAR_MONTH)
createTable(CREATE_EDITOR_YEAR_MONTH_NAMESPACE,EDITOR_YEAR_MONTH_NAMESPACE)
# not creating the 'day' table, it can be used for time series analysis, it is not useful for cohort visualizations.
# createTable(CREATE_EDITOR_YEAR_MONTH_DAY_NAMESPACE,EDITOR_YEAR_MONTH_DAY_NAMESPACE)
createTable(CREATE_TIME_YEAR_MONTH_NAMESPACE,TIME_YEAR_MONTH_NAMESPACE)
# not creating the 'day' table, it can be used for time series analysis, it is not useful for cohort visualizations.
# createTable(CREATE_TIME_YEAR_MONTH_DAY_NAMESPACE,TIME_YEAR_MONTH_DAY_NAMESPACE)
createTable(CREATE_BOT_LIST,BOT_LIST)
createIndex(INDEX_BOT_LIST,BOT_LIST)
executeCommand(EXPORT_BOT_LIST,'Exporting bot list for cohort analysis')