# Source code for cohorts.histogram

'''This module implements histogram cohorts, e.g. EditsHistogram.
'''

import sys,logging
logger = logging.getLogger('Histogram cohorts')


try:
    import numpy as N
except ImportError:
    # Only a missing package is tolerated here; other failures (and
    # KeyboardInterrupt/SystemExit) propagate instead of being swallowed.
    # The module still loads, but any later use of N raises NameError.
    logger.error('Numpy not installed')
    


import settings
import utils

from cohorts.base import Cohort
from data import tables


class EditsHistogram(Cohort):
    '''The cohorts are based on the number of edits they have done in a given month. Implemented only for MongoDB.

    For every month, each editor is put into one bin according to the number of
    edits summed over the namespaces in `self.NS`. `self.data['editsHistogram']`
    holds the number of editors per bin (row) and month (column).
    '''
    def __init__(self):

        # We don't take into consideration people who have 0 edits as the data is coded sparse and we don't have
        # this information
        # Note that len(self.cohorts) has to be equal the number of bins in the numpy.array

        #: Cohort definition: inclusive upper bounds of the bins, followed by the
        #: label of the open-ended last bin.
        self.cohorts = [1,5,10,50,100,500,1000,'>1000 edits']

        #: Cohort labels: '1 edit', '2-5 edits', ..., '>1000 edits'
        self.cohort_labels = self.cohorts[:]
        self.cohort_labels[0] = '%s edit'%(self.cohorts[0])
        for i in range(1,len(self.cohorts)-1):
            self.cohort_labels[i] = '%s-%s edits'%(self.cohorts[i-1]+1,self.cohorts[i])

        Cohort.__init__(self)

    def getIndex(self, edits):
        '''
        Returns the index of the cohort

        :arg edits: number of edits of an editor
        :returns: index of the first bin whose upper bound is >= `edits`, or the
                  index of the last (open-ended) bin if `edits` exceeds every bound
        '''
        # Only the numeric bounds are compared. The last entry of self.cohorts is
        # a string label; comparing a number against it only worked through the
        # arbitrary int/str ordering of Python 2 and raises TypeError on Python 3.
        for i,upper in enumerate(self.cohorts[:-1]):
            if edits <= upper:
                return i

        return len(self.cohorts)-1


    def processMongoDocument(self,editor):
        '''Adds one editor document to the histogram.

        :arg editor: dict with a 'user_id' and, optionally, an 'edit_count' entry
                     that is iterated here as year -> month -> namespace -> number of edits

        Documents without 'edit_count' and editors flagged by `utils.isBot` are skipped.
        '''
        if 'edit_count' not in editor:
            return

        if utils.isBot(editor['user_id']):
            return

        for year,edity in editor['edit_count'].items():
            for month,editm in edity.items():
                # extract year and month 20xxxx
                ym = '%s%02d'%(year,int(month))

                # months that are not part of the tracked time stamps are ignored
                time_index = self.time_stamps_index.get(ym,None)
                if time_index is None:
                    continue

                # cohort index depends on the aggregate of the namespaces
                nedits = 0
                for ns,ecount in editm.items():
                    if ns in self.NS:
                        nedits += ecount

                if nedits > 0:
                    # Sparse representation. Only collecting >0 edits because it is possible that an editor
                    # has no edits in that namespace, but edits in other namespaces. As a consequence the
                    # number of people with zero edits in that month is not counted.
                    cohorts_index = self.getIndex(nedits)

                    # increment the histogram bin for that editor and month
                    self.data['editsHistogram'][cohorts_index,time_index] += 1


    def initData(self):
        '''Allocates the (number of bins x number of time stamps) histogram, filled with zeros.
        '''
        self.data['editsHistogram'] = N.zeros((len(self.cohorts), len(self.time_stamps)))

        self.initDataDescription()

    def initDataDescription(self):
        '''Initialize the self.data_description dictionary with information used for plotting.
        '''
        self.data_description['editsHistogram'] = {     'title' : 'Histogram of the number of edits',
                                                        'ylabel' : 'Number of Editors'}


    def getColor(self, i):
        '''
        Returns a color based on the index of the cohort i
        '''
        return self.colors[i]

    def colorbarTicksAndLabels(self,ncolors):
        '''Returns ticks and labels for the colorbar of a WikiPride visualization

        :arg ncolors: number of ticks/labels to produce
        :returns: tuple (ticks, labels)
        '''
        nlabels = ncolors

        # tick positions are the centers of `nlabels` equal segments of [0,1]
        ticks = N.linspace(0, (1.-1./nlabels), nlabels) +0.5/nlabels
        # evenly spread (truncated) indices into the cohort labels
        skip = [ int(i) for i in N.linspace(0,len(self.cohorts)-1,nlabels) ]
        labels = [self.cohort_labels[i] for i in skip]

        return ticks,labels


class EditorActivity(Cohort):
    '''The cohorts are based on the number of edits they have done in a given month. It uses a table where the values are aggregated for all namespaces.

    Each SQL row (one editor, one month) is put into the bin of its edit count. Per bin
    (row) and month (column) the class collects the number of editors, the number of
    edits and the bytes added/removed/net.
    '''
    def __init__(self):

        # We don't take into consideration people who have 0 edits as the data is coded sparse and we don't have
        # this information

        #: Cohort definition: inclusive upper bounds of the bins, followed by the
        #: label of the open-ended last bin.
        self.cohorts = [1,5,50,100,500,1000,'>1000 edits']

        #: Cohort labels: '1 edit', '2-5 edits', ..., '>1000 edits'
        self.cohort_labels = self.cohorts[:]
        self.cohort_labels[0] = '%s edit'%(self.cohorts[0])
        for i in range(1,len(self.cohorts)-1):
            self.cohort_labels[i] = '%s-%s edits'%(self.cohorts[i-1]+1,self.cohorts[i])

        #: The SQL query returns edit information for each editor for each ym she has edited.
        self.sqlQuery = 'SELECT * FROM %s;'%tables.EDITOR_YEAR_MONTH

        Cohort.__init__(self)

    def getIndex(self, edits):
        '''
        Returns the index of the cohort

        :arg edits: number of edits of an editor
        :returns: index of the first bin whose upper bound is >= `edits`, or the
                  index of the last (open-ended) bin if `edits` exceeds every bound
        '''
        # Only the numeric bounds are compared. The last entry of self.cohorts is
        # a string label; comparing a number against it only worked through the
        # arbitrary int/str ordering of Python 2 and raises TypeError on Python 3.
        for i,upper in enumerate(self.cohorts[:-1]):
            if edits <= upper:
                return i

        return len(self.cohorts)-1


    def initData(self):
        '''Allocates one zero-filled (number of bins x number of time stamps) array per measure.
        '''
        shape = (len(self.cohorts), len(self.time_stamps))
        for measure in ('added','removed','net','edits','editors'):
            self.data[measure] = N.zeros(shape)

        self.initDataDescription()


    def initDataDescription(self):
        '''Initialize the self.data_description dictionary with information used for plotting.
        '''
        bots = 'no bots' if self.nobots else 'including bots'

        def megabytes(x):
            # tick label formatter: value divided by 1e6, printed as an integer
            return '%d'%(x/1e6)

        self.data_description['added'] = {  'title' : 'Megabytes added by editor activity ( %s, namespaces:All)'%(bots),
                                            'ylabel': 'Megabytes',
                                            'ytickslabel' : megabytes }
        self.data_description['removed'] = {  'title' : 'Megabytes removed by editor activity ( %s, namespaces:All)'%(bots),
                                            'ylabel': 'Megabytes',
                                            'ytickslabel' : megabytes }

        self.data_description['net'] = {  'title' : 'Megabytes Added-Removed by editor activity ( %s, namespaces:All)'%(bots),
                                            'ylabel': 'Megabytes',
                                            'ytickslabel' : megabytes }

        self.data_description['edits'] = {  'title' : 'Number of edits by editor activity ( %s, namespaces:All)'%(bots),
                                            'ylabel': 'Edits' }

        self.data_description['editors'] = {  'title' : 'Active editor histogram (%s, namespaces:All)'%(bots),
                                             'ylabel': 'Number of Editors' }


    def processSQLrow(self,row):
        '''Adds one row (one editor in one month) of the query result to the data.

        :arg row: mapping with the keys 'user_id', 'rev_year', 'rev_month', 'add_edits',
                  'remove_edits', 'noop_edits', 'len_added' and 'len_removed'; the
                  count and length values may be None

        Rows of editors flagged by `utils.isBot` and rows whose month is not part of
        the tracked time stamps are skipped.
        '''
        if utils.isBot(row['user_id']):
            return

        ym = '%d%02d'%(row['rev_year'],row['rev_month'])

        time_index = self.time_stamps_index.get(ym,None)
        if time_index is None:
            return

        # total number of edits of this editor in this month; missing values count as 0
        edits = 0
        for key in ('add_edits','remove_edits','noop_edits'):
            if row[key] is not None:
                edits += int(row[key])

        cohorts_index = self.getIndex(edits)

        self.data['editors'][cohorts_index,time_index] += 1

        len_added = row['len_added']
        len_removed = row['len_removed']

        if len_added is not None:
            self.data['added'][cohorts_index,time_index] += int(len_added)
        if len_removed is not None:
            # sign is flipped here while 'net' adds the raw value -- presumably
            # len_removed is stored as a negative number; verify against the table
            self.data['removed'][cohorts_index,time_index] += -int(len_removed)
        if len_added is not None and len_removed is not None:
            self.data['net'][cohorts_index,time_index] += int(len_added) + int(len_removed)

        self.data['edits'][cohorts_index,time_index] += edits


    def getColor(self, i):
        '''
        Returns a color based on the index of the cohort i
        '''
        return self.colors[i]

    def colorbarTicksAndLabels(self,ncolors):
        '''Returns ticks and labels for the colorbar of a WikiPride visualization

        :arg ncolors: number of ticks/labels to produce
        :returns: tuple (ticks, labels)
        '''
        nlabels = ncolors

        # tick positions are the centers of `nlabels` equal segments of [0,1]
        ticks = N.linspace(0, (1.-1./nlabels), nlabels) +0.5/nlabels
        # evenly spread (truncated) indices into the cohort labels
        skip = [ int(i) for i in N.linspace(0,len(self.cohorts)-1,nlabels) ]
        labels = [self.cohort_labels[i] for i in skip]

        return ticks,labels


    def linePlots(self,dest):
        '''Graphs for editor activity histogram cohort include

        * Number of editors by activity
        * Number of edits by activity
        * Bytes added by activity
        * Bytes added per editor
        * Bytes added per edit
        * Edits per editor
        * The first year of one-year cohorts in one plot (x-axis is age, not time)

        :arg dest: destination handed to `self.saveFigure` for every graph
        '''
        logger.info('Creating line plots for %s'%self)

        def graph(d,title,ylabel,ylog=False,loc=None):
            # one line per bin (row of d), all in one figure that is saved under its title
            fig = None
            for i in range(d.shape[0]):
                fig = self.addLine(data=d[i,:],fig=fig,label=self.cohort_labels[i])

            self.saveFigure(name=title, fig=fig, dest=dest, title=title,ylabel=ylabel,ylog=ylog,legendpos=loc)


        added = self.data['added']
        edits = self.data['edits']
        editors = self.data['editors']

        graph(d=editors,title="Number of editors by activity", ylabel="# of editors",loc=2)
        graph(d=edits,title="Number of edits by activity", ylabel="# of edits",loc=2)
        graph(d=added,title="Bytes added by activity", ylabel="Bytes",loc=2)

        # Zero counts are replaced by 1 to avoid divisions by zero. This is done on
        # copies: patching the arrays of self.data in place would permanently turn
        # empty bins into bins with one editor/edit.
        safe_editors = editors.copy()
        safe_editors[safe_editors==0]=1
        safe_edits = edits.copy()
        safe_edits[safe_edits==0]=1

        graph(d=added/safe_editors,title="Bytes added per editor", ylabel="Bytes (log scale)",ylog=True,loc=2)
        graph(d=safe_edits/safe_editors,title="Edits per editor", ylabel="# of edits (log scale)",ylog=True,loc=1)
        graph(d=added/safe_edits,title="Bytes added per edit", ylabel="Bytes",loc=1)

        # The first year of one-year cohorts in one plot (x-axis is age, not time)
        # NOTE(review): this figure is built but never handed to saveFigure, and the
        # slices below select rows (bins) with offsets derived from the number of
        # columns (months) -- confirm the intent before relying on it.
        step = 12
        fig = None
        # floor division keeps the range arguments integers on Python 3 as well
        for i in range(0,(added.shape[1]//step)*step,step):
            e = safe_edits[i:(i+step-1),:].sum(axis=0)
            e[e==0] = 1
            data = added[i:(i+step-1),:].sum(axis=0)/e
            fig = self.addLine(data=data,fig=fig,label='%s-%s months active'%(i,(i+step-1)))

        

class NewEditorActivity(Cohort):
    '''The cohorts are based on the number of edits they have done in a given month.

    Only the activity up to `period` months after an editor's first edit is taken
    into account. The totals of an editor are added to the column of the month of her
    first edit, in the bin (row) of her total number of edits in that window.

    Rows are expected to arrive grouped by editor: the totals of an editor are written
    when the first row of the next editor is seen.
    NOTE(review): nothing visible in this class writes the totals of the very last
    editor, and no `sqlQuery` is set here -- confirm that the base class or the caller
    takes care of both.
    '''
    def __init__(self,period=3  ):

        # We don't take into consideration people who have 0 edits as the data is coded sparse and we don't have
        # this information
        # Note that len(self.cohorts) has to be equal the number of bins in the numpy.array

        #: Cohort definition: inclusive upper bounds of the bins, followed by the
        #: label of the open-ended last bin.
        self.cohorts = [1,5,10,50,100,500,1000,'>1000 edits']

        #: Cohort labels: '1 edit', '2-5 edits', ..., '>1000 edits'
        self.cohort_labels = self.cohorts[:]
        self.cohort_labels[0] = '%s edit'%(self.cohorts[0])
        for i in range(1,len(self.cohorts)-1):
            self.cohort_labels[i] = '%s-%s edits'%(self.cohorts[i-1]+1,self.cohorts[i])

        #: The number of month an editor is considered new
        self.period = period
        #: The user_id of the previously encountered editor as we iterate through the table
        self.old_user_id = None
        #: The ym at the end of the `period` months after the first edit of an editor
        self.lastym = None
        #: The ym of the first edit of the current editor
        self.firstedit = None
        #: Column index of `firstedit`, None if that month is not tracked
        self.fe_index = None

        # running totals of the current editor
        self.edits = 0
        self.added = 0
        self.removed = 0
        self.net = 0

        Cohort.__init__(self)

    def getIndex(self, edits):
        '''
        Returns the index of the cohort

        :arg edits: number of edits of an editor
        :returns: index of the first bin whose upper bound is >= `edits`, or the
                  index of the last (open-ended) bin if `edits` exceeds every bound
        '''
        # Only the numeric bounds are compared. The last entry of self.cohorts is
        # a string label; comparing a number against it only worked through the
        # arbitrary int/str ordering of Python 2 and raises TypeError on Python 3.
        for i,upper in enumerate(self.cohorts[:-1]):
            if edits <= upper:
                return i

        return len(self.cohorts)-1

    def _flushEditor(self):
        '''Adds the running totals of the current editor to self.data.

        Does nothing if no editor has been seen yet or if the month of her first
        edit is not tracked (`fe_index` is None). Without that check numpy would
        treat the None index as a new axis and update the whole row of the bin.
        '''
        if self.old_user_id is None or self.fe_index is None:
            return

        # getting bin of the number of edits the editor has done
        cohorts_index = self.getIndex(self.edits)

        self.data['editors'][cohorts_index,self.fe_index] += 1
        # accumulate (not overwrite), so that all editors of a bin contribute
        self.data['added'][cohorts_index,self.fe_index] += self.added
        self.data['removed'][cohorts_index,self.fe_index] += self.removed
        self.data['net'][cohorts_index,self.fe_index] += self.net
        self.data['edits'][cohorts_index,self.fe_index] += self.edits

    def processSQLrow(self,row):
        '''Adds one row (one editor in one month) to the running totals of that editor.

        :arg row: mapping with the keys 'user_id', 'first_edit_year', 'first_edit_month',
                  'rev_year', 'rev_month', 'add_edits', 'remove_edits', 'noop_edits',
                  'len_added' and 'len_removed'; the count and length values may be None

        Rows of editors flagged by `utils.isBot`, of editors whose first edit is not in
        a tracked month and rows later than `lastym` are skipped.
        '''
        editor_id = row['user_id']

        if utils.isBot(editor_id):
            return

        if editor_id != self.old_user_id:
            # When encountering a new editor, the previous one is complete: write her
            # totals, then extract the firstedit and the last ym of interest.
            self._flushEditor()

            # initializing info for current editor
            self.old_user_id = editor_id

            year = row['first_edit_year']
            month = row['first_edit_month']

            self.firstedit = '%d%02d'%(year,month)
            self.fe_index = self.time_stamps_index.get(self.firstedit,None)

            if self.fe_index is None:
                return

            # Month arithmetic on a 0-based month so that the result is always a
            # well-formed ym (December instead of month 00 of the following year).
            # String comparisons against real ym values give the same result as before.
            months = month - 1 + self.period
            self.lastym = '%d%02d'%(year + months//12, months%12 + 1)

            self.edits = 0
            self.added = 0
            self.removed = 0
            self.net = 0

        if self.fe_index is None:
            return

        ym = '%d%02d'%(row['rev_year'],row['rev_month'])

        # zero-padded YYYYMM strings sort chronologically
        if ym > self.lastym:
            return

        for key in ('add_edits','remove_edits','noop_edits'):
            if row[key] is not None:
                self.edits += int(row[key])

        len_added = row['len_added']
        len_removed = row['len_removed']

        if len_added is not None:
            self.added += int(len_added)
        if len_removed is not None:
            # sign is flipped here while 'net' adds the raw value -- presumably
            # len_removed is stored as a negative number; verify against the table
            self.removed += -int(len_removed)
        if len_added is not None and len_removed is not None:
            self.net += int(len_added) + int(len_removed)


    def initData(self):
        '''Allocates one zero-filled (number of bins x number of time stamps) array per measure.
        '''
        shape = (len(self.cohorts), len(self.time_stamps))
        for measure in ('added','removed','net','edits','editors'):
            self.data[measure] = N.zeros(shape)

        self.initDataDescription()


    def initDataDescription(self):
        '''Initialize the self.data_description dictionary with information used for plotting.
        '''
        bots = 'no bots' if self.nobots else 'including bots'

        def megabytes(x):
            # tick label formatter: value divided by 1e6, printed as an integer
            return '%d'%(x/1e6)

        self.data_description['added'] = {  'title' : 'Megabytes added by new editor activity ( <%s months, %s, namespaces:All)'%(self.period, bots),
                                            'ylabel': 'Megabytes',
                                            'ytickslabel' : megabytes }
        self.data_description['removed'] = {  'title' : 'Megabytes removed by new editor activity ( <%s months, %s, namespaces:All)'%(self.period, bots),
                                            'ylabel': 'Megabytes',
                                            'ytickslabel' : megabytes }

        self.data_description['net'] = {  'title' : 'Megabytes Added-Removed by new editor activity ( <%s months, %s, namespaces:All)'%(self.period, bots),
                                            'ylabel': 'Megabytes',
                                            'ytickslabel' : megabytes }

        self.data_description['edits'] = {  'title' : 'Number of edits by new editor activity ( <%s months, %s, namespaces:All)'%(self.period, bots),
                                            'ylabel': 'Edits' }

        self.data_description['editors'] = {  'title' : 'Edits Histogram for the first %s month of an edit activity (%s, namespaces:All)'%(self.period, bots),
                                             'ylabel': 'Number of Editors' }


    def getColor(self, i):
        '''
        Returns a color based on the index of the cohort i
        '''
        return self.colors[i]

    def colorbarTicksAndLabels(self,ncolors):
        '''Returns ticks and labels for the colorbar of a WikiPride visualization

        :arg ncolors: number of ticks/labels to produce
        :returns: tuple (ticks, labels)
        '''
        nlabels = ncolors

        # tick positions are the centers of `nlabels` equal segments of [0,1]
        ticks = N.linspace(0, (1.-1./nlabels), nlabels) +0.5/nlabels
        # evenly spread (truncated) indices into the cohort labels
        skip = [ int(i) for i in N.linspace(0,len(self.cohorts)-1,nlabels) ]
        labels = [self.cohort_labels[i] for i in skip]

        return ticks,labels
# Navigation
#
# Quick search
#
# Source code for cohorts.histogram
#
# Navigation