Source code for data.report

"""This module defines the content of a report, which consists of the following at the moment. 

* Community roles
    * User  
    * Administrators
* Cohort trends
    * Age Cohorts            
        * More than 1 edit 
        * More than 5 edits
        * More than 100 edits
        * Less than 100 edits 
    * New editors
    * Histogram cohorts
    * Namespaces
* User lists
    * Most active editors

"""
import os, errno
import logging
logger = logging.getLogger('Report')

import utils
import settings

from data import userlists


class ReportItem(object):
    """
    A report consists of a collection of report items. A report item consists
    of a cohort instance and methods to generate the data and the plots.

    :arg cohort: cohort instance whose data is generated, stored and plotted
    :arg dest: path of the item's directory, relative to the report base directories
    """

    def __init__(self, cohort, dest):
        self.cohort = cohort
        '''Cohort instance
        '''
        self.relDest = dest
        '''Relative path to the destination directory'''

    def createDirectory(self, base):
        '''Creates the directory if it doesn't exist already. The `base` directory is
        joined with the relative destination directory and returned.

        :arg base: base directory (e.g. settings.datadirectory or settings.wikipridedirectory)
        :returns: absolute path
        :raises OSError: if the directory cannot be created, or if the path
            already exists but is not a directory
        '''
        p = os.path.join(base, self.relDest)
        try:
            os.makedirs(p)
        except OSError as exc:  # Python >2.5
            # An already existing directory is fine. Any other failure, including
            # a regular file occupying the path, must not be hidden from the caller.
            if exc.errno != errno.EEXIST or not os.path.isdir(p):
                raise
        return p

    def loadData(self):
        '''Loads the data from disk if available.

        Calls :meth:`loadDataFromDisk` on the cohort once for every variable name
        in the cohort's `data_description`, reading from the item's directory
        below `REPORTDATA`.
        '''
        source = os.path.join(REPORTDATA, self.relDest)
        for varName in self.cohort.data_description.keys():
            self.cohort.loadDataFromDisk(varName=varName, destination=source)

    def freeData(self):
        '''Frees the data in hope of reducing the memory usage of the process.

        Every entry is removed from the cohort's `data` mapping; the mapping
        object itself is kept.
        '''
        # Iterate over a snapshot of the keys: deleting entries while iterating
        # the live key view raises a RuntimeError on Python 3.
        for varName in list(self.cohort.data.keys()):
            del self.cohort.data[varName]

    def generateData(self):
        '''Generates and saves the cohort data.

        Calls the :meth:`.aggregateDataFromSQL` method from the :class:`.Cohort`
        instance passed as argument. The collected data matrices are stored in
        the :attr:`.Cohort.data` attribute. The data matrices are saved as txt
        files in the data destination directory. The in-memory data is freed
        afterwards.
        '''
        self.cohort.aggregateDataFromSQL(verbose=True)

        dest = self.createDirectory(base=REPORTDATA)
        self.cohort.saveDataToDisk(destination=dest)

        self.freeData()

    def generateCSV(self):
        '''Stores a simple csv file in a format used by the javascript
        `dygraphs <http://dygraphs.com/>`_ library.

        The data is loaded from disk first and written to the item's directory
        below `REPORTCSV`.
        '''
        self.loadData()

        dest = self.createDirectory(base=REPORTCSV)
        self.cohort.saveDataToCSV(destination=dest)

    def generateVisualizations(self, varNames, **kargs):
        '''For the variables names in `varNames`, produces the WikiPride graphs using
        :meth:`.wikiPride` (e.g. `added`, `editors`, ...). The cohort's
        :meth:`linePlots` method is called afterwards as well.

        :arg kargs: arguments passed directly to :meth:`.wikiPride`. E.g. `flip=True`, `percentage=False`.
        :arg varNames: list of str, containing the names of the variables for which wikipride should be produced.
        '''
        self.loadData()

        dest = self.createDirectory(base=REPORTGRAPHS)

        for v in varNames:
            self.cohort.wikiPride(varName=v, dest=dest, **kargs)

        self.cohort.linePlots(dest=dest)

        self.freeData()


#hackz
# The report structure is built at import time. Everything is wrapped in a
# try block so that importing this module does not fail when the structure
# cannot be created (according to the log message below, most likely because
# the settings have not been read yet). Names that were not reached before
# the failure stay undefined.
try:
    # Absolute path directories
    REPORTDATA = os.path.join(settings.reportdirectory, 'data')
    REPORTGRAPHS = os.path.join(settings.reportdirectory, 'graphs')
    REPORTLISTS = os.path.join(settings.reportdirectory, 'lists')
    REPORTCSV = os.path.join(settings.reportdirectory, 'csv')

    # Relative path directory tree for the report
    COMMUNITY = "Community_roles"

    COHORTTREND = "Cohort_trends"
    AGE = os.path.join(COHORTTREND, "Age_cohorts")

    ABS_AGE = os.path.join(COHORTTREND, "Absolute_age")
    ABS_MORE1 = os.path.join(ABS_AGE, "More_than_1_edit")
    ABS_MORE5 = os.path.join(ABS_AGE, "More_than_5_edits")
    ABS_MORE100 = os.path.join(ABS_AGE, "More_than_100_edits")
    ABS_LESS100 = os.path.join(ABS_AGE, "Less_than_100_edits")

    REL_AGE = os.path.join(COHORTTREND, "Relative_age")
    REL_MORE1 = os.path.join(REL_AGE, "More_than_1_edit")
    REL_MORE5 = os.path.join(REL_AGE, "More_than_5_edits")
    REL_MORE100 = os.path.join(REL_AGE, "More_than_100_edits")
    REL_LESS100 = os.path.join(REL_AGE, "Less_than_100_edits")

    NEWEDITORS = os.path.join(COHORTTREND, "New_editors")
    HISTOGRAM = os.path.join(COHORTTREND, "Histogram_cohorts")
    NAMESPACES = os.path.join(COHORTTREND, "Namespaces")

    USERLISTS = "User_lists"

    # Report items
    from cohorts import age
    from cohorts import histogram
    from cohorts import simple

    absMore1 = ReportItem(cohort=age.AbsoluteAgeAllNamespaces(minedits=1),
                          dest=ABS_MORE1)
    absMore5 = ReportItem(cohort=age.AbsoluteAgeAllNamespaces(minedits=5),
                          dest=ABS_MORE5)
    absMore100 = ReportItem(cohort=age.AbsoluteAgeAllNamespaces(minedits=100),
                            dest=ABS_MORE100)
    absLess100 = ReportItem(cohort=age.AbsoluteAgeAllNamespaces(minedits=1, maxedits=100),
                            dest=ABS_LESS100)

    relMore1 = ReportItem(cohort=age.RelativeAgeAllNamespaces(minedits=1),
                          dest=REL_MORE1)
    relMore5 = ReportItem(cohort=age.RelativeAgeAllNamespaces(minedits=5),
                          dest=REL_MORE5)
    relMore100 = ReportItem(cohort=age.RelativeAgeAllNamespaces(minedits=100),
                            dest=REL_MORE100)
    relLess100 = ReportItem(cohort=age.RelativeAgeAllNamespaces(minedits=1, maxedits=100),
                            dest=REL_LESS100)

    editorActivity = ReportItem(cohort=histogram.EditorActivity(),
                                dest=HISTOGRAM)
    nsCohort = ReportItem(cohort=simple.NameSpaces(),
                          dest=NAMESPACES)
    newEditors = ReportItem(cohort=simple.NewEditors(),
                            dest=NEWEDITORS)
except Exception:
    # Still best-effort, but KeyboardInterrupt/SystemExit are no longer
    # swallowed, and logger.exception records the traceback of the real cause.
    logger.exception("Exception when creating report structure. Likely settings haven't been read. Failure very likely...")
def processData():
    '''Aggregates and saves the cohort data of every report item.

    The aggregation of the cohort data requires that
    :func:`data.preprocessing.process` has been executed and the data thus
    preprocessed.

    Bot filtering is configured from `settings.filterbots` and the bot list
    file first. Then :meth:`ReportItem.generateData` is called, one after the
    other, on each report item defined in this module, which stores the
    item's data in its directory of the report's data directory tree.
    '''
    message = 'Aggregating the cohort data for %swiki' % settings.language
    logger.info(message)

    utils.setFilterBots(settings.filterbots, userlists.BOT_LIST_FILE)

    # Absolute age cohorts
    absMore1.generateData()
    absMore5.generateData()
    absMore100.generateData()
    absLess100.generateData()

    # Relative age cohorts
    relMore1.generateData()
    relMore5.generateData()
    relMore100.generateData()
    relLess100.generateData()

    # Remaining cohorts
    editorActivity.generateData()
    nsCohort.generateData()
    newEditors.generateData()
def processCSV():
    '''Saves the data of every report item as csv files for dygraphs.

    Each report item loads its previously aggregated data from disk, so
    :func:`processData` has to be executed beforehand.

    Bot filtering is configured from `settings.filterbots` and the bot list
    file first. Then :meth:`ReportItem.generateCSV` is called, one after the
    other, on each report item defined in this module.
    '''
    message = 'Saving dygraph CSV data files for %swiki' % settings.language
    logger.info(message)

    utils.setFilterBots(settings.filterbots, userlists.BOT_LIST_FILE)

    # Absolute age cohorts
    absMore1.generateCSV()
    absMore5.generateCSV()
    absMore100.generateCSV()
    absLess100.generateCSV()

    # Relative age cohorts
    relMore1.generateCSV()
    relMore5.generateCSV()
    relMore100.generateCSV()
    relLess100.generateCSV()

    # Remaining cohorts
    editorActivity.generateCSV()
    nsCohort.generateCSV()
    newEditors.generateCSV()
def processReport():
    '''Creates the graphs of every report item.

    Requires that :func:`data.report.processData` has been executed and the
    data thus aggregated. The data is loaded from disk.

    The age and editor activity cohorts are plotted for the variables
    `added`, `edits` and `editors`; the relative age cohorts are additionally
    passed `flip=True`. The namespace cohort is plotted for `added` and
    `edits` only, the new editors cohort for `editors` only, with
    `percentage=False` and `colorbar=False`.
    '''
    standard_vars = ['added', 'edits', 'editors']

    # Absolute age cohorts
    absMore1.generateVisualizations(varNames=standard_vars)
    absMore5.generateVisualizations(varNames=standard_vars)
    absMore100.generateVisualizations(varNames=standard_vars)
    absLess100.generateVisualizations(varNames=standard_vars)

    # Relative age cohorts
    relMore1.generateVisualizations(varNames=standard_vars, flip=True)
    relMore5.generateVisualizations(varNames=standard_vars, flip=True)
    relMore100.generateVisualizations(varNames=standard_vars, flip=True)
    relLess100.generateVisualizations(varNames=standard_vars, flip=True)

    # Remaining cohorts
    editorActivity.generateVisualizations(varNames=standard_vars)
    nsCohort.generateVisualizations(varNames=['added', 'edits'])
    newEditors.generateVisualizations(varNames=['editors'],
                                      percentage=False,
                                      colorbar=False)