Source code for cohorts.age

'''This module implements age cohorts, AbsoluteAge and RelativeAge.
'''

import sys,logging
logger = logging.getLogger('Age cohorts')

# numpy is required by every cohort class in this module. The failure is only
# logged here (best effort); `N` then stays undefined and any later use of it
# raises NameError.
try:
    import numpy as N
except ImportError:
    # Catch only a missing package; a bare except would also swallow
    # KeyboardInterrupt/SystemExit and unrelated errors raised during import.
    logger.error('Numpy not installed')
    


import settings
import utils


from cohorts.base import Cohort
from data import tables


class Age(Cohort):
    '''Abstract base class for the namespace-aware age cohorts.

    Subclasses are expected to assign `self.cohorts` and `self.cohort_labels`
    before they call `Age.__init__`.
    '''

    def __init__(self):
        definition_missing = self.cohorts is None or self.cohort_labels is None
        if definition_missing:
            # Only reported, not raised: initialisation continues regardless.
            logger.error("self.cohorts or self.cohort_labels not properly defined")

        Cohort.__init__(self)

    def initData(self):
        '''Allocate one zero-filled matrix per metric and build the descriptions.

        Each matrix has one row per cohort and one column per time stamp.
        '''
        for metric in ('added', 'removed', 'net', 'edits', 'editors'):
            self.data[metric] = N.zeros((len(self.cohorts), len(self.time_stamps)))

        self.initDataDescription()

    def initDataDescription(self):
        '''Fill `self.data_description` with title and axis information per metric.

        Every title carries the bot setting (`self.nobots`) and the namespace
        list (`self.NS`, shown as 'all' once it has 16 or more entries).
        '''
        def scope():
            # Evaluated per title, exactly like the inline expressions it replaces.
            bots = 'no bots' if self.nobots else 'including bots'
            namespaces = ','.join(self.NS) if len(self.NS) < 16 else 'all'
            return (bots, namespaces)

        def megabyte_metric(title_template):
            # Byte counts are plotted in units of 1e6 bytes.
            return {
                'title': title_template % scope(),
                'ylabel': 'Megabytes',
                'ytickslabel': lambda x: '%d' % (x / 1e6),
            }

        def count_metric(title_template, ylabel):
            return {
                'title': title_template % scope(),
                'ylabel': ylabel,
            }

        self.data_description['added'] = megabyte_metric(
            'Megabytes added ( %s, namespaces:%s)')
        self.data_description['removed'] = megabyte_metric(
            'Megabytes removed ( %s, namespaces:%s)')
        self.data_description['net'] = megabyte_metric(
            'Megabytes Added-Removed ( %s, namespaces:%s)')
        self.data_description['edits'] = count_metric(
            'Number of edits ( %s, namespaces:%s)', 'Edits')
        self.data_description['editors'] = count_metric(
            'Number of editors active ( %s, namespaces:%s)', 'Editors')
class AbsoluteAgePerMonth(Age):
    '''A cohort is the group of people that have started editing in the same month.
    '''

    def __init__(self):
        self.cohorts = list(range(len(settings.time_stamps)))
        '''Cohort definition: one cohort index per entry of settings.time_stamps
        '''

        self.cohort_labels = ['%s-%s'%(settings.time_stamps[i][:4],settings.time_stamps[i][4:]) for i in self.cohorts]
        '''Cohort labels, built from the first four characters of the time stamp,
        a dash, and the remainder
        '''

        self.old_user_id = None
        '''The user_id of the previously encountered editor as we iterate through the table
        '''

        Age.__init__(self)

    def processSQLrow(self,row):
        '''Accumulate one result row into the cohort matrices.

        `row` is indexed by column name and must provide `user_id`,
        `rev_year`, `rev_month`, `namespace`, `first_edit_year`,
        `first_edit_month`, `len_added`, `len_removed`, `add_edits` and
        `remove_edits`.

        Rows are skipped when the editor is a bot (`utils.isBot`), or when the
        edit month or the first-edit month is not in `self.time_stamps_index`.
        Rows whose namespace is not in `self.NS` leave the data untouched.

        Raises:
            Exception: if anything fails while processing; the message
                contains the offending row.
        '''
        try:
            editor_id = row['user_id']
            if utils.isBot(editor_id):
                return

            year = row['rev_year']
            month = row['rev_month']
            ns = str(row['namespace'])

            ym = '%d%02d'%(year,month)
            time_index = self.time_stamps_index.get(ym)
            if time_index is None:
                return

            firstedit = '%d%02d'%(row['first_edit_year'],row['first_edit_month'])
            fe_index = self.time_stamps_index.get(firstedit)
            if fe_index is None:
                return

            cohorts_index = self.getIndex(fe_index)

            if ns in self.NS:
                len_added = row['len_added']
                len_removed = row['len_removed']
                add_edits = row['add_edits']
                remove_edits = row['remove_edits']

                if len_added is not None:
                    self.data['added'][cohorts_index,time_index] += int(len_added)
                if len_removed is not None:
                    # presumably len_removed is stored as a non-positive number,
                    # hence the sign flip here and the plain sum for 'net'
                    self.data['removed'][cohorts_index,time_index] += -int(len_removed)
                # Both byte columns are needed for the net value (the previous
                # guard tested len_removed twice and never len_added).
                if len_added is not None and len_removed is not None:
                    self.data['net'][cohorts_index,time_index] += int(len_added) + int(len_removed)
                # Guard the two columns that are actually summed (the previous
                # guard tested len_removed instead of add_edits).
                if add_edits is not None and remove_edits is not None:
                    self.data['edits'][cohorts_index,time_index] += int(add_edits) + int(remove_edits)

                # NOTE(review): the original indentation was lost; confirm that
                # the editor count belongs inside the namespace filter.
                if editor_id != self.old_user_id:
                    # counting the editor we encountered
                    self.data['editors'][cohorts_index,time_index] += 1
                    self.old_user_id = editor_id
        except Exception:
            # Keep the original traceback in the log, then raise the same
            # exception type and message callers already handle. Unlike a bare
            # except, KeyboardInterrupt and SystemExit now pass through.
            logger.exception('Failed to process row')
            raise Exception('row:\n%s'%row)

    def getIndex(self, fe):
        '''
        Returns the index of the cohort, which is identical to the time index of the first edit
        '''
        return fe

    def colorbarTicksAndLabels(self,ncolors):
        '''Returns ticks and labels for the colorbar of a WikiPride visualization

        At most 15 labels are produced because too many dates are unreadable.
        Ticks are evenly spaced on [0, 1]; labels are picked at evenly spaced
        positions of `self.cohort_labels`.
        '''
        nlabels = min(ncolors+1, 15)

        ticks = N.linspace(0, 1., nlabels)
        skip = [int(i) for i in N.linspace(0,len(self.cohorts)-1,nlabels)]
        labels = [self.cohort_labels[i] for i in skip]

        return ticks,labels
class RelativeAgePerMonth(Age):
    '''A cohort is the group of people that have the same age at the time of an edit.
    During the first month of editing, a contributor will be in the 1-month old cohort,
    then he switches to the 2-month cohort and so forth.
    '''

    def __init__(self):
        self.cohorts = list(range(len(settings.time_stamps)))
        '''Cohort definition: one cohort index per entry of settings.time_stamps
        '''

        self.cohort_labels = ['%s month old'% i for i in self.cohorts]
        '''Cohort labels
        '''

        self.old_user_id = None
        '''The user_id of the previously encountered editor as we iterate through the table
        '''

        Age.__init__(self)

    def processSQLrow(self,row):
        '''Accumulate one result row into the cohort matrices.

        `row` is indexed by column name and must provide `user_id`,
        `rev_year`, `rev_month`, `namespace`, `first_edit_year`,
        `first_edit_month`, `len_added`, `len_removed`, `add_edits` and
        `remove_edits`.

        Rows are skipped when the editor is a bot (`utils.isBot`), or when the
        edit month or the first-edit month is not in `self.time_stamps_index`.
        Rows whose namespace is not in `self.NS` leave the data untouched.
        '''
        editor_id = row['user_id']
        if utils.isBot(editor_id):
            return

        year = row['rev_year']
        month = row['rev_month']
        ns = str(row['namespace'])

        ym = '%d%02d'%(year,month)
        time_index = self.time_stamps_index.get(ym)
        if time_index is None:
            return

        firstedit = '%d%02d'%(row['first_edit_year'],row['first_edit_month'])
        fe_index = self.time_stamps_index.get(firstedit)
        if fe_index is None:
            return

        cohorts_index = self.getIndex(time_index, fe_index)

        if ns in self.NS:
            len_added = row['len_added']
            len_removed = row['len_removed']
            add_edits = row['add_edits']
            remove_edits = row['remove_edits']

            if len_added is not None:
                self.data['added'][cohorts_index,time_index] += int(len_added)
            if len_removed is not None:
                # presumably len_removed is stored as a non-positive number,
                # hence the sign flip here and the plain sum for 'net'
                self.data['removed'][cohorts_index,time_index] += -int(len_removed)
            # Both byte columns are needed for the net value (the previous
            # guard tested len_removed twice and never len_added).
            if len_added is not None and len_removed is not None:
                self.data['net'][cohorts_index,time_index] += int(len_added) + int(len_removed)
            # Guard the two columns that are actually summed (the previous
            # guard tested len_removed instead of add_edits).
            if add_edits is not None and remove_edits is not None:
                self.data['edits'][cohorts_index,time_index] += int(add_edits) + int(remove_edits)

            # NOTE(review): the original indentation was lost; confirm that
            # the editor count belongs inside the namespace filter.
            if editor_id != self.old_user_id:
                # counting the editor we encountered
                self.data['editors'][cohorts_index,time_index] += 1
                self.old_user_id = editor_id

    def getIndex(self,ti,fe):
        '''
        Returns the index of the cohort (i.e. the relative age of the editor) from the time index
        of the edit and time index of the first edit
        '''
        return ti-fe

    def colorbarTicksAndLabels(self,ncolors):
        '''Returns ticks and labels for the colorbar of a WikiPride visualization

        At most 15 labels are produced because too many labels are unreadable.
        Ticks are evenly spaced on [0, 1]; labels are picked at evenly spaced
        positions of `self.cohort_labels`.
        '''
        nlabels = min(ncolors+1, 15)

        ticks = N.linspace(0, 1., nlabels)
        skip = [int(i) for i in N.linspace(0,len(self.cohorts)-1,nlabels)]
        labels = [self.cohort_labels[i] for i in skip]

        return ticks,labels
class RelativeAgePerDay(Age):
    '''Groups editors by their age at the time of an edit, tracked per day.
    '''

    def __init__(self):
        month_stamps, _ = utils.create_time_stamps_month()

        self.cohorts = [int(position) for position in range(0, len(month_stamps))]
        '''Cohort definition: one index per time stamp returned by
        utils.create_time_stamps_month()
        '''

        self.cohort_labels = ['%s days old' % cohort for cohort in self.cohorts]
        '''Cohort labels
        '''

        Age.__init__(self)

    def processSQLrow(self, row):
        '''Add the values of one result row to the per-day matrices.

        Nothing is recorded for bot editors (`utils.isBot`), for rows whose
        edit day or first-edit day is not a key of `self.time_stamps_index`,
        or for namespaces outside `self.NS`.
        '''
        if utils.isBot(row['user_id']):
            return

        edit_day = '%d%02d%02d' % (row['rev_year'], row['rev_month'], row['rev_day'])
        namespace = str(row['namespace'])

        edit_position = self.time_stamps_index.get(edit_day, None)
        if edit_position is None:
            return

        # NOTE(review): only the first eight characters of the
        # 'first_edit_year' column are used as the lookup key; presumably the
        # column holds a full timestamp here. Confirm against the query.
        first_day = str(row['first_edit_year'])[:8]
        first_position = self.time_stamps_index.get(first_day, None)
        if first_position is None:
            return

        cohort = self.getIndex(edit_position, first_position)

        if namespace not in self.NS:
            return

        # NOTE(review): these '...PerDay' keys are not the ones created by
        # Age.initData in this file; confirm where they are allocated.
        self.data['addedPerDay'][cohort, edit_position] += int(row['len_added'])
        self.data['removedPerDay'][cohort, edit_position] += -int(row['len_removed'])
        self.data['netPerDay'][cohort, edit_position] += int(row['len_added']) + int(row['len_removed'])
        self.data['editsPerDay'][cohort, edit_position] += int(row['add_edits']) + int(row['remove_edits'])

    def getIndex(self, ti, fe):
        '''
        Returns the cohort index computed from the time index of the edit and
        the time index of the first edit: their difference modulo 30, limited
        to the last cohort index.
        '''
        # NOTE(review): the reason for the modulo 30 is not visible here.
        age = (ti - fe) % 30
        last_cohort = len(self.cohorts) - 1
        return last_cohort if age > last_cohort else age

    def colorbarTicksAndLabels(self, ncolors):
        '''Returns ticks and labels for the colorbar of a WikiPride visualization

        The number of labels is capped at 15 because more are unreadable.
        '''
        wanted = ncolors + 1
        nlabels = 15 if wanted > 15 else wanted

        ticks = N.linspace(0, 1., nlabels)

        positions = N.linspace(0, len(self.cohorts) - 1, nlabels)
        labels = []
        for position in positions:
            labels.append(self.cohort_labels[int(position)])

        return ticks, labels
class AbsoluteAgeAllNamespaces(Cohort):
    '''A cohort is the group of people that have started editing in the same month.
    '''

    def __init__(self,minedits=1,maxedits=None):
        self.cohorts = list(range(len(settings.time_stamps)))
        '''Cohort definition: one cohort index per entry of settings.time_stamps
        '''

        self.cohort_labels = ['%s-%s'%(settings.time_stamps[i][:4],settings.time_stamps[i][4:]) for i in self.cohorts]
        '''Cohort labels, built from the first four characters of the time stamp,
        a dash, and the remainder
        '''

        self.sqlQuery = 'SELECT * FROM %s;'%tables.EDITOR_YEAR_MONTH
        '''The SQL query returns edit information for each editor for each ym she has edited.'''

        self.minedits = minedits
        '''Minimum number of edits by editor in a given month to be included'''
        self.maxedits = maxedits
        '''Maximum number of edits by editor in a given month to be included'''

        # Floor division keeps the colour count a whole number on Python 3 as well.
        self.ncolors = utils.numberOfMonths(settings.time_stamps[0],settings.time_stamps[-1])//6
        '''
        Number of visible colors in the wikipride plots. E.g. one color for every six month for wikipride plots
        '''

        Cohort.__init__(self)

    def initData(self):
        '''Allocate one zero-filled matrix per metric and build the descriptions.

        Each matrix has one row per cohort and one column per time stamp.
        '''
        for metric in ('added', 'removed', 'net', 'edits', 'editors'):
            self.data[metric] = N.zeros((len(self.cohorts), len(self.time_stamps)))

        self.initDataDescription()

    def initDataDescription(self):
        '''Initialize the self.data_description dictionary with additional information
        '''
        editspan = "%s<edits%s"%(self.minedits,'<%s'%self.maxedits if self.maxedits is not None else '')
        scope = (editspan, 'no bots' if self.nobots else 'including bots')

        self.data_description['added'] = {
            'title' : 'Megabytes added by editor activity ( %s, %s, all namespaces)'%scope,
            'ylabel': 'Megabytes',
            'ytickslabel' : lambda x : '%d'%(x/1e6) }
        self.data_description['removed'] = {
            'title' : 'Megabytes removed by editor activity ( %s, %s, all namespaces)'%scope,
            'ylabel': 'Megabytes',
            'ytickslabel' : lambda x : '%d'%(x/1e6) }
        self.data_description['net'] = {
            'title' : 'Megabytes Added-Removed by editor activity ( %s, %s, all namespaces)'%scope,
            'ylabel': 'Megabytes',
            'ytickslabel' : lambda x : '%d'%(x/1e6) }
        self.data_description['edits'] = {
            'title' : 'Number of edits by editor activity ( %s, %s, all namespaces)'%scope,
            'ylabel': 'Edits' }
        self.data_description['editors'] = {
            'title' : 'Active editor histogram ( %s, %s, all namespaces)'%scope,
            'ylabel': 'Number of Editors' }

    def processSQLrow(self,row):
        '''Accumulate one editor-month row into the cohort matrices.

        `row` is indexed by column name and must provide `user_id`,
        `rev_year`, `rev_month`, `first_edit_year`, `first_edit_month`,
        `add_edits`, `remove_edits`, `noop_edits`, `len_added` and
        `len_removed`.

        Rows are skipped when the editor is a bot (`utils.isBot`), when the
        edit month or first-edit month is not in `self.time_stamps_index`, or
        when the row's edit total lies outside [minedits, maxedits].
        '''
        editor_id = row['user_id']
        if utils.isBot(editor_id):
            return

        ym = '%d%02d'%(row['rev_year'],row['rev_month'])
        time_index = self.time_stamps_index.get(ym)
        if time_index is None:
            return

        firstedit = '%d%02d'%(row['first_edit_year'],row['first_edit_month'])
        fe_index = self.time_stamps_index.get(firstedit)
        if fe_index is None:
            return

        cohorts_index = self.getIndex(fe_index)

        # NULL columns count as zero edits.
        edits = 0
        for column in ('add_edits', 'remove_edits', 'noop_edits'):
            if row[column] is not None:
                edits += int(row[column])

        if edits < self.minedits:
            return
        # Test for None first: comparing an int with None raises TypeError on
        # Python 3.
        if self.maxedits is not None and edits > self.maxedits:
            return

        self.data['editors'][cohorts_index,time_index] += 1

        len_added = row['len_added']
        len_removed = row['len_removed']
        if len_added is not None:
            self.data['added'][cohorts_index,time_index] += int(len_added)
        if len_removed is not None:
            # presumably len_removed is stored as a non-positive number,
            # hence the sign flip here and the plain sum for 'net'
            self.data['removed'][cohorts_index,time_index] += -int(len_removed)
        if len_added is not None and len_removed is not None:
            self.data['net'][cohorts_index,time_index] += int(len_added) + int(len_removed)

        self.data['edits'][cohorts_index,time_index] += edits

    def getIndex(self, fe):
        '''
        Returns the index of the cohort, which is identical to the time index of the first edit
        '''
        return fe

    def colorbarTicksAndLabels(self,ncolors):
        '''Returns ticks and labels for the colorbar of a WikiPride visualization

        Ticks sit at the centre of each of the `ncolors+1` colour bands.
        Each label names a half-year and a year, e.g. '1-6-2004'.
        '''
        nlabels = ncolors+1

        ticks = N.linspace(0, (1.-1./nlabels), nlabels) + 0.5/nlabels
        skip = [int(i) for i in N.linspace(0,len(self.cohorts)-1,nlabels)]

        labels = []
        for i in skip:
            # Labels are built as 'YYYY-MM' in __init__, so the year is the
            # first four characters and the month follows the dash. The
            # previous slicing treated them as 'MM-YYYY', which always gave
            # '7-12' and a broken year.
            label = self.cohort_labels[i]
            year = label[:4]
            month = int(label[5:7])
            labels.append('%s-%s'%('1-6' if month <= 6 else '7-12', year))

        return ticks,labels

    def __repr__(self):
        '''String representation of cohort.
        '''
        editspan = "%s<edits%s"%(self.minedits,'<%s'%self.maxedits if self.maxedits is not None else '')
        return "Absolute Age Cohort (%s)"%editspan
class RelativeAgeAllNamespaces(Cohort):
    '''A cohort is the group of people that have the same age at the time of an edit.
    During the first month of editing, a contributor will be in the 1-month old cohort,
    then he switches to the 2-month cohort and so forth.
    '''

    def __init__(self,minedits=1,maxedits=None):
        self.cohorts = list(range(len(settings.time_stamps)))
        '''Cohort definition: one cohort index per entry of settings.time_stamps
        '''

        self.cohort_labels = ['%s month old'% i for i in self.cohorts]
        '''Cohort labels
        '''

        self.sqlQuery = 'SELECT * FROM %s;'%tables.EDITOR_YEAR_MONTH
        '''The SQL query returns edit information for each editor for each ym she has edited.'''

        self.minedits = minedits
        '''Minimum number of edits by editor in a given month to be included'''
        self.maxedits = maxedits
        '''Maximum number of edits by editor in a given month to be included'''

        # Floor division keeps the colour count a whole number on Python 3 as well.
        self.ncolors = utils.numberOfMonths(settings.time_stamps[0],settings.time_stamps[-1])//6
        '''
        Number of visible colors in the wikipride plots. E.g. one color for every six month for wikipride plots
        '''

        Cohort.__init__(self)

    def initData(self):
        '''Allocate one zero-filled matrix per metric and build the descriptions.

        Each matrix has one row per cohort and one column per time stamp.
        '''
        for metric in ('added', 'removed', 'net', 'edits', 'editors'):
            self.data[metric] = N.zeros((len(self.cohorts), len(self.time_stamps)))

        self.initDataDescription()

    def initDataDescription(self):
        '''Initialize the self.data_description dictionary with additional information
        '''
        editspan = "%s<edits%s"%(self.minedits,'<%s'%self.maxedits if self.maxedits is not None else '')
        scope = (editspan, 'no bots' if self.nobots else 'including bots')

        self.data_description['added'] = {
            'title' : 'Megabytes added ( %s, %s, all namespaces)'%scope,
            'ylabel': 'Megabytes',
            'ytickslabel' : lambda x : '%d'%(x/1e6) }
        self.data_description['removed'] = {
            'title' : 'Megabytes removed ( %s, %s, all namespaces)'%scope,
            'ylabel': 'Megabytes',
            'ytickslabel' : lambda x : '%d'%(x/1e6) }
        self.data_description['net'] = {
            'title' : 'Megabytes Added-Removed ( %s, %s, all namespaces)'%scope,
            'ylabel': 'Megabytes',
            'ytickslabel' : lambda x : '%d'%(x/1e6) }
        self.data_description['edits'] = {
            'title' : 'Number of edits ( %s, %s, all namespaces)'%scope,
            'ylabel': 'Edits' }
        self.data_description['editors'] = {
            'title' : 'Number of active editors ( %s, %s, all namespaces)'%scope,
            'ylabel': 'Number of Editors' }

    def processSQLrow(self,row):
        '''Accumulate one editor-month row into the cohort matrices.

        `row` is indexed by column name and must provide `user_id`,
        `rev_year`, `rev_month`, `first_edit_year`, `first_edit_month`,
        `add_edits`, `remove_edits`, `noop_edits`, `len_added` and
        `len_removed`.

        Rows are skipped when the editor is a bot (`utils.isBot`), when the
        edit month or first-edit month is not in `self.time_stamps_index`, or
        when the row's edit total lies outside [minedits, maxedits].
        '''
        editor_id = row['user_id']
        if utils.isBot(editor_id):
            return

        ym = '%d%02d'%(row['rev_year'],row['rev_month'])
        time_index = self.time_stamps_index.get(ym)
        if time_index is None:
            return

        firstedit = '%d%02d'%(row['first_edit_year'],row['first_edit_month'])
        fe_index = self.time_stamps_index.get(firstedit)
        if fe_index is None:
            return

        cohorts_index = self.getIndex(time_index, fe_index)

        # NULL columns count as zero edits.
        edits = 0
        for column in ('add_edits', 'remove_edits', 'noop_edits'):
            if row[column] is not None:
                edits += int(row[column])

        if edits < self.minedits:
            return
        # Test for None first: comparing an int with None raises TypeError on
        # Python 3.
        if self.maxedits is not None and edits > self.maxedits:
            return

        self.data['editors'][cohorts_index,time_index] += 1

        len_added = row['len_added']
        len_removed = row['len_removed']
        if len_added is not None:
            self.data['added'][cohorts_index,time_index] += int(len_added)
        if len_removed is not None:
            # presumably len_removed is stored as a non-positive number,
            # hence the sign flip here and the plain sum for 'net'
            self.data['removed'][cohorts_index,time_index] += -int(len_removed)
        if len_added is not None and len_removed is not None:
            self.data['net'][cohorts_index,time_index] += int(len_added) + int(len_removed)

        self.data['edits'][cohorts_index,time_index] += edits

    def getIndex(self,ti,fe):
        '''
        Returns the index of the cohort (i.e. the relative age of the editor) from the time index
        of the edit and time index of the first edit
        '''
        return ti-fe

    def colorbarTicksAndLabels(self,ncolors):
        '''Returns ticks and labels for the colorbar of a WikiPride visualization

        Ticks are evenly spaced and shifted by half a step. Each label names
        a range of cohort ages taken from `self.cohorts`, so one label fewer
        than ticks is returned.
        '''
        nlabels = ncolors+1

        ticks = N.linspace(0, 1., nlabels) + 1./(nlabels-1)*1/2
        skip = [int(i) for i in N.linspace(0,len(self.cohorts)-1,nlabels)]

        labels = ['%s-%s months old'%(self.cohorts[skip[i+1]],self.cohorts[skip[i]+1]) for i in range(0,len(skip)-2)]
        labels.append('%s-%s months old'%(self.cohorts[-1],self.cohorts[skip[-2]+1]))

        return ticks,labels

    def __repr__(self):
        '''String representation of cohort.
        '''
        editspan = "%s<edits%s"%(self.minedits,'<%s'%self.maxedits if self.maxedits is not None else '')
        return "Relative Age Cohort (%s)"%editspan

    def _yearlyShareFigure(self,values):
        '''Plot, per complete 12-month age group, that group's share of the column totals of `values`.

        `values` is a (cohort x time stamp) matrix. Returns the figure built
        through `self.addLine`, or None when there is no complete group.
        '''
        total = values.sum(axis=0)
        total[total==0] = 1     # avoid dividing by zero for empty time stamps

        months = 12
        fig = None
        # Groups run along the cohort axis (rows). The slice end is exclusive,
        # so start:start+months covers the ages named in the label.
        for start in range(0,(values.shape[0]//months)*months,months):
            share = values[start:start+months,:].sum(axis=0)/total
            fig = self.addLine(data=share,fig=fig,label='%s-%s months active'%(start,start+months-1))
        return fig

    def linePlots(self,dest):
        '''Graphs for relative age cohorts include

            * Bytes added per edit (new vs. old editors)
            * Contribution percentage of bytes added for each one year cohort
            * Editor percentage for each one year cohort

        `dest` is passed through to `self.saveFigure` for every figure.
        '''
        logger.info('Creating line plots for %s'%self)

        editspan = "%s<edits%s"%(self.minedits,'<%s'%self.maxedits if self.maxedits is not None else '')

        added = self.data['added']
        edits = self.data['edits']
        editors = self.data['editors']

        # Bytes added per edit (new vs. old editors). Rows 0..6 are the
        # "0-6 months" editors and rows 7.. the older ones; the previous 0:6
        # slice silently dropped row 6 from both groups. The +1 avoids a
        # division by zero.
        six = added[0:7,:].sum(axis=0)/(edits[0:7,:].sum(axis=0)+1)
        moresix = added[7:,:].sum(axis=0)/(edits[7:,:].sum(axis=0)+1)

        six = utils.movingAverage(array=six, WINDOW=3)
        moresix = utils.movingAverage(array=moresix, WINDOW=3)

        fig = self.addLine(data=six,label='new editors (0-6 months active)')
        fig = self.addLine(data=moresix,fig=fig,label='older editors (>6 months active)')
        self.saveFigure(name='bytes_per_edit_new_vs_old', fig=fig, dest=dest, title='Bytes added per edit (new vs. old editors, %s)'%editspan,ylabel='Bytes', legendpos=1)

        # Contribution percentage of bytes added for each one year cohort
        fig = self._yearlyShareFigure(added)
        self.saveFigure(name='percentage_added_line', fig=fig, dest=dest, title='Contribution percentage of bytes added for editors with %s'%editspan,ylabel='Percentage', legendpos=1)

        # Editor percentage for each one year cohort
        fig = self._yearlyShareFigure(editors)
        self.saveFigure(name='percentage_editor_line', fig=fig, dest=dest, title='Editor age percentage for editors with %s'%editspan,ylabel='Percentage', legendpos=1)