# Source code for data.report

"""This module defines the content of a report, which consists of the following at the moment. 

* Community roles
    * User  
    * Administrators
* Cohort trends
    * Age Cohorts            
        * More than 1 edit 
        * More than 5 edits
        * More than 100 edits
        * Less than 100 edits 
    * New editors
    * Histogram cohorts
    * Namespaces
* User lists
    * Most active editors

"""
import os, errno
import logging
logger = logging.getLogger('Report')

import utils
import settings

from data import userlists


class ReportItem():
    """
    A report consists of a collection of report items. A report item consists of a cohort instance and methods to generate the data and the plots.

    The methods rely on the module-level absolute directories `REPORTDATA`, `REPORTCSV` and `REPORTGRAPHS`, which are looked up when a method is called.
    """
    def __init__(self, cohort, dest):
        '''
        :arg cohort: cohort instance this report item wraps
        :arg dest: str, destination directory relative to the report base directories
        '''

        self.cohort = cohort
        '''Cohort instance
        '''

        self.relDest = dest
        '''Relative path to the destination directory'''

    def createDirectory(self, base):
        '''Creates the directory if it doesn't exist already. The `base` directory is joined with the relative destination directory and returned.

        :arg base: base directory (e.g. settings.datadirectory or settings.wikipridedirectory)
        :returns: the path obtained by joining `base` and the relative destination
        :raises OSError: if the directory cannot be created, or if the path exists but is not a directory
        '''
        p = os.path.join(base, self.relDest)

        try:
            os.makedirs(p)
        except OSError as exc:  # Python >2.5
            # An already existing directory is fine. Everything else is a
            # real error, including a regular file occupying the path
            # (which also reports EEXIST).
            if exc.errno != errno.EEXIST or not os.path.isdir(p):
                raise
        return p

    def loadData(self):
        '''Loads the data from disk if available.

        Calls the cohort's `loadDataFromDisk` once for every variable name in the cohort's `data_description`, reading from the item's directory below `REPORTDATA`.
        '''
        for varName in self.cohort.data_description.keys():
            self.cohort.loadDataFromDisk(varName=varName, destination=os.path.join(REPORTDATA, self.relDest))

    def freeData(self):
        '''Frees the data in hope of reducing the memory usage of the process.

        Removes every entry from the cohort's `data` mapping in place.
        '''
        # Iterate over a snapshot of the keys: deleting entries while
        # iterating the live key view raises RuntimeError on Python 3.
        for varName in list(self.cohort.data.keys()):
            del self.cohort.data[varName]

    def generateData(self):
        '''Generates and saves the cohort data. Calls the :meth:`.aggregateDataFromSQL` method from the :class:`.Cohort` instance passed as argument. The collected data matrices are stored in the :attr:`.Cohort.data` attribute. The data matrices are saved as txt files in the data destination directory.

        The in-memory data is freed again once it has been saved.
        '''

        self.cohort.aggregateDataFromSQL(verbose=True)

        dest = self.createDirectory(base=REPORTDATA)
        self.cohort.saveDataToDisk(destination=dest)

        self.freeData()

    def generateCSV(self):
        '''Stores a simple csv file in a format used by the javascript `dygraphs <http://dygraphs.com/>`_ library.

        The data is loaded from disk first and freed again after the csv files have been written.
        '''

        self.loadData()

        dest = self.createDirectory(base=REPORTCSV)

        self.cohort.saveDataToCSV(destination=dest)

        # Release the loaded data, as generateData and
        # generateVisualizations do once they are finished with it.
        self.freeData()

    def generateVisualizations(self, varNames, **kargs):
        '''For the variables names in `varNames`, produces the WikiPride graphs using :meth:`.wikiPride` (e.g. `added`, `editors`, ...). Afterwards the cohort's `linePlots` method is called with the same destination.

        The data is loaded from disk first and freed again at the end.

        :arg kargs: arguments passed directly to :meth:`.wikiPride`. E.g. `flip=True`, `percentage=False`.
        :arg varNames: list of str, containing the names of the variables for which wikipride should be produced.
        '''

        self.loadData()

        dest = self.createDirectory(base=REPORTGRAPHS)

        for v in varNames:
            self.cohort.wikiPride(varName=v, dest=dest, **kargs)

        self.cohort.linePlots(dest=dest)

        self.freeData()


# HACK: the report structure is built at import time and depends on
# `settings.reportdirectory` and on the cohort classes. Any failure is
# logged and the import continues; names that were not assigned before the
# failure stay undefined, so the process* functions below will raise
# NameError when they try to use them.
try:

    # Absolute path directories

    REPORTDATA = os.path.join(settings.reportdirectory, 'data')
    REPORTGRAPHS = os.path.join(settings.reportdirectory, 'graphs')
    REPORTLISTS = os.path.join(settings.reportdirectory, 'lists')
    REPORTCSV = os.path.join(settings.reportdirectory, 'csv')

    # Relative path directory tree for the report

    COMMUNITY = "Community_roles"

    COHORTTREND = "Cohort_trends"
    AGE = os.path.join(COHORTTREND, "Age_cohorts")
    ABS_AGE = os.path.join(COHORTTREND, "Absolute_age")
    ABS_MORE1 = os.path.join(ABS_AGE, "More_than_1_edit")
    ABS_MORE5 = os.path.join(ABS_AGE, "More_than_5_edits")
    ABS_MORE100 = os.path.join(ABS_AGE, "More_than_100_edits")
    ABS_LESS100 = os.path.join(ABS_AGE, "Less_than_100_edits")
    REL_AGE = os.path.join(COHORTTREND, "Relative_age")
    REL_MORE1 = os.path.join(REL_AGE, "More_than_1_edit")
    REL_MORE5 = os.path.join(REL_AGE, "More_than_5_edits")
    REL_MORE100 = os.path.join(REL_AGE, "More_than_100_edits")
    REL_LESS100 = os.path.join(REL_AGE, "Less_than_100_edits")

    NEWEDITORS = os.path.join(COHORTTREND, "New_editors")
    HISTOGRAM = os.path.join(COHORTTREND, "Histogram_cohorts")
    NAMESPACES = os.path.join(COHORTTREND, "Namespaces")

    USERLISTS = "User_lists"

    # Report items: one ReportItem per cohort definition, each bound to its
    # relative destination directory. The cohort modules are imported here
    # so that an import failure is handled by the same except clause.

    from cohorts import age
    from cohorts import histogram
    from cohorts import simple

    absMore1 = ReportItem(cohort=age.AbsoluteAgeAllNamespaces(minedits=1), dest=ABS_MORE1)
    absMore5 = ReportItem(cohort=age.AbsoluteAgeAllNamespaces(minedits=5), dest=ABS_MORE5)
    absMore100 = ReportItem(cohort=age.AbsoluteAgeAllNamespaces(minedits=100), dest=ABS_MORE100)
    absLess100 = ReportItem(cohort=age.AbsoluteAgeAllNamespaces(minedits=1, maxedits=100), dest=ABS_LESS100)

    relMore1 = ReportItem(cohort=age.RelativeAgeAllNamespaces(minedits=1), dest=REL_MORE1)
    relMore5 = ReportItem(cohort=age.RelativeAgeAllNamespaces(minedits=5), dest=REL_MORE5)
    relMore100 = ReportItem(cohort=age.RelativeAgeAllNamespaces(minedits=100), dest=REL_MORE100)
    relLess100 = ReportItem(cohort=age.RelativeAgeAllNamespaces(minedits=1, maxedits=100), dest=REL_LESS100)

    editorActivity = ReportItem(cohort=histogram.EditorActivity(), dest=HISTOGRAM)

    nsCohort = ReportItem(cohort=simple.NameSpaces(), dest=NAMESPACES)
    newEditors = ReportItem(cohort=simple.NewEditors(), dest=NEWEDITORS)

except Exception:
    # Deliberately best-effort, but only for ordinary errors: a bare
    # `except:` would also swallow KeyboardInterrupt and SystemExit.
    # logger.exception logs at ERROR level and appends the traceback.
    logger.exception("Exception when creating report structure. Likely settings haven't been read. Failure very likely...")
    
def processData():
    '''Aggregates and saves the cohort data of every report item.

    The aggregation of the cohort data requires that :func:`data.preprocessing.process` has been executed and the data thus preprocessed. This function uses the report definition in this module (:mod:`.report`): after configuring the bot filter via `utils.setFilterBots`, it calls :meth:`.ReportItem.generateData` on each report item in turn.
    '''
    logger.info('Aggregating the cohort data for %swiki', settings.language)

    utils.setFilterBots(settings.filterbots, userlists.BOT_LIST_FILE)

    # All report items, in processing order. Resolving the names up front
    # means a partially built report structure fails here, before any
    # aggregation has been started.
    report_items = (
        absMore1, absMore5, absMore100, absLess100,
        relMore1, relMore5, relMore100, relLess100,
        editorActivity,
        nsCohort,
        newEditors,
    )

    # aggregate and save cohort data
    for item in report_items:
        item.generateData()


def processCSV():
    '''Saves the dygraphs CSV data files of every report item.

    Calls :meth:`.ReportItem.generateCSV` on each report item in turn, after configuring the bot filter via `utils.setFilterBots`. Each item loads its cohort data from disk, so :func:`data.report.processData` should have been executed beforehand.
    '''
    logger.info('Saving dygraph CSV data files for %swiki', settings.language)

    utils.setFilterBots(settings.filterbots, userlists.BOT_LIST_FILE)

    # All report items, in processing order. Resolving the names up front
    # means a partially built report structure fails here, before any CSV
    # file has been written.
    report_items = (
        absMore1, absMore5, absMore100, absLess100,
        relMore1, relMore5, relMore100, relLess100,
        editorActivity,
        nsCohort,
        newEditors,
    )

    # load the saved cohort data and write it out as csv
    for item in report_items:
        item.generateCSV()




def processReport():
    '''Produces the graphs of every report item.

    Requires that :func:`data.report.processData` has been executed and the data thus aggregated; each report item loads its data from disk before plotting.
    '''
    # Variables plotted for most cohorts.
    standard_vars = ['added', 'edits', 'editors']
    # Extra plotting arguments used for the relative age cohorts.
    flipped = {'flip': True}

    # Absolute age cohorts
    absMore1.generateVisualizations(standard_vars)
    absMore5.generateVisualizations(standard_vars)
    absMore100.generateVisualizations(standard_vars)
    absLess100.generateVisualizations(standard_vars)

    # Relative age cohorts
    relMore1.generateVisualizations(standard_vars, **flipped)
    relMore5.generateVisualizations(standard_vars, **flipped)
    relMore100.generateVisualizations(standard_vars, **flipped)
    relLess100.generateVisualizations(standard_vars, **flipped)

    # Histogram cohort
    editorActivity.generateVisualizations(standard_vars)

    # Namespace and new-editor cohorts plot a reduced set of variables
    nsCohort.generateVisualizations(['added', 'edits'])

    newEditors.generateVisualizations(['editors'], percentage=False, colorbar=False)
Navigation

Quick search

Source code for data.report

Navigation