# Source code for cohorts.simple

'''Simple cohort definitions.

Defines the Cohort subclasses OneYearCohort, ProjectSpaceCohorts,
NameSpaces and NewEditors.
'''

import sys
import logging

# Module-wide logger shared by all cohort classes in this file.
logger = logging.getLogger('Age cohorts')

try:
    import numpy as N
except ImportError:
    # Only a missing numpy is tolerated here (the original bare ``except``
    # also swallowed KeyboardInterrupt/SystemExit and unrelated errors).
    # The module stays importable, but ``N`` is left unbound, so any later
    # use of ``N`` raises NameError.
    logger.error('Numpy not installed')
    

import settings
import utils

from cohorts.base import Cohort
from data import tables


class OneYearCohort(Cohort):
    '''Cohort of the active editors whose first edit falls in one given year.

    Two cohort slots are declared (the year itself and ``'others'``), but
    :meth:`getIndex` always answers 0, so :meth:`processSQLrow` only ever
    fills the first row of each data array.
    '''

    def __init__(self, year, activation=5, overall=False):
        '''Store the cohort parameters and build the SQL query.

        :arg year: first-edit year that defines the cohort
        :arg activation: edit threshold; the SQL query keeps values strictly
            greater than it (originally described as the minimum number of
            edits per month)
        :arg overall: if true, the threshold is applied to ``SUM(add_edits)``
            per editor; otherwise to ``add_edits`` of every single row
        '''
        self.year = year
        self.activation = activation
        self.overall = overall

        # Time stamps run from January of the cohort year to 201012.
        stamps, stamp_index = utils.create_time_stamps_month(fromym='%s01' % self.year, toym='201012')
        self.time_stamps = stamps
        self.time_stamps_index = stamp_index

        # Cohort definition and the labels derived from it.
        self.cohorts = [self.year, 'others']
        self.cohort_labels = ['%s cohort' % name for name in self.cohorts]

        if not self.overall:
            self.sqlQuery = 'SELECT * FROM fabian WHERE add_edits > %s AND first_edit_year=%s' % (self.activation, self.year)
        else:
            self.sqlQuery = "SELECT * FROM fabian WHERE user_id IN (SELECT user_id FROM fabian WHERE first_edit_year=%s GROUP BY user_id HAVING SUM(add_edits)>%s);" % (self.year, self.activation)

        Cohort.__init__(self)

    def initData(self):
        '''Allocate one zeroed (cohorts x time stamps) array per metric.'''
        shape = (len(self.cohorts), len(self.time_stamps))
        for metric in ('added', 'removed', 'net', 'edits'):
            self.data[metric] = N.zeros(shape)

        self.initDataDescription()

    def initDataDescription(self):
        '''Fill self.data_description with title/axis information per metric.

        Reads ``self.nobots`` and ``self.NS``, which are not set in this
        class (presumably provided by ``Cohort`` -- verify).
        '''
        bots = 'no bots' if self.nobots else 'including bots'
        namespaces = ','.join(self.NS) if len(self.NS) < 16 else 'all'
        fill = (self.year, self.activation, bots, namespaces)

        self.data_description['added'] = {
            'title': '%s Cohort, >%s edits/month - Megabytes added (%s, namespaces:%s)' % fill,
            'ylabel': 'Megabytes',
            'ytickslabel': lambda x: '%d' % (x/1e6),
        }
        self.data_description['removed'] = {
            'title': '%s Cohort, >%s edits/month - Megabytes removed (%s, namespaces:%s)' % fill,
            'ylabel': 'Megabytes',
            'ytickslabel': lambda x: '%d' % (x/1e6),
        }
        self.data_description['net'] = {
            'title': '%s Cohort, >%s edits/month - Megabytes Added-Removed (%s, namespaces:%s)' % fill,
            'ylabel': 'Megabytes',
            'ytickslabel': lambda x: '%d' % (x/1e6),
        }
        self.data_description['edits'] = {
            'title': '%s Cohort, >%s edits/month - Number of edits (%s, namespaces:%s)' % fill,
            'ylabel': 'Edits',
        }

    def processSQLrow(self, row):
        '''Accumulate one SQL result row into the data arrays.

        Rows are ignored when the editor is a bot (``utils.isBot``), when the
        revision month or the first-edit month is not a known time stamp, or
        when the namespace is not in ``self.NS``.
        '''
        if utils.isBot(row['user_id']):
            return

        rev_year = row['rev_year']
        rev_month = row['rev_month']
        namespace = str(row['namespace'])

        column = self.time_stamps_index.get('%d%02d' % (rev_year, rev_month), None)
        if column is None:
            return

        first_edit = '%d%02d' % (row['first_edit_year'], row['first_edit_month'])
        first_edit_index = self.time_stamps_index.get(first_edit, None)
        if first_edit_index is None:
            return

        line = self.getIndex(first_edit_index)

        if namespace not in self.NS:
            return

        cell = (line, column)
        self.data['added'][cell] += int(row['len_added'])
        # 'removed' stores the negated len_removed value.
        self.data['removed'][cell] -= int(row['len_removed'])
        self.data['net'][cell] += int(row['len_added']) + int(row['len_removed'])
        self.data['edits'][cell] += int(row['add_edits']) + int(row['remove_edits'])

    def getIndex(self, fe):
        '''Return the cohort index for a first-edit time index.

        The argument is ignored: the answer is always 0, i.e. the slot of
        the cohort year.
        '''
        return 0
class ProjectSpaceCohorts(Cohort):
    '''Yearly cohorts (2004-2010) of editors, grouped by first-edit year.

    Only contributions to the Wikipedia project namespaces 4 and 5 are
    accumulated.
    '''

    def __init__(self, activation=5):
        '''Set up namespaces, time stamps and the yearly cohorts.

        :arg activation: edit threshold; in this class it is only used in
            the plot title
        '''
        self.activation = activation
        self.NS = ('4', '5')

        # Time stamps cover 200401 through 201012.
        ts, ts_i = utils.create_time_stamps_month(fromym='200401', toym='201012')
        self.time_stamps = ts
        self.time_stamps_index = ts_i

        # Cohort definition: one cohort per first-edit year.
        self.cohorts = range(2004, 2011)
        # Cohort labels.
        self.cohort_labels = ['%s cohort' % i for i in self.cohorts]

        # NOTE(review): no sqlQuery is assigned here; the filtered query below
        # was already commented out, so presumably the base class supplies
        # one -- confirm.
        # self.sqlQuery = 'SELECT * FROM fabian WHERE namespace IN (4,5) AND add_edits > %s AND first_edit_year in (%s)'%(self.activation,','.join([str(c) for c in self.cohorts]))

        Cohort.__init__(self)

    def initData(self):
        '''Allocate one zeroed (cohorts x time stamps) array per metric.'''
        self.data['added'] = N.zeros((len(self.cohorts), len(self.time_stamps)))
        self.data['removed'] = N.zeros((len(self.cohorts), len(self.time_stamps)))
        self.data['net'] = N.zeros((len(self.cohorts), len(self.time_stamps)))
        self.data['edits'] = N.zeros((len(self.cohorts), len(self.time_stamps)))

        self.initDataDescription()

    def initDataDescription(self):
        '''Initialize the self.data_description dictionary.

        Only the 'added' metric gets a description; 'removed', 'net' and
        'edits' are accumulated but have no description entry.
        '''
        self.data_description['added'] = {
            'title': 'Megabytes added to Project namespaces, >%s edits/month' % (self.activation),
            'ylabel': 'Megabytes',
            'ytickslabel': lambda x: '%d' % (x/1e6),
        }

    def processSQLrow(self, row):
        '''Accumulate one SQL result row into the data arrays.

        Rows are skipped when the editor is a bot, the revision month is not
        a known time stamp, the namespace is not in ``self.NS``, or the
        editor's first-edit year matches none of the cohorts.
        '''
        editor_id = row['user_id']
        if utils.isBot(editor_id):
            return

        year = row['rev_year']
        month = row['rev_month']
        ns = str(row['namespace'])

        ym = '%d%02d' % (year, month)
        time_index = self.time_stamps_index.get(ym, None)
        if time_index is None:
            return

        # Check the namespace before resolving the cohort, so irrelevant
        # rows never reach getIndex.
        if ns not in self.NS:
            return

        try:
            cohorts_index = self.getIndex(row['first_edit_year'])
        except ValueError:
            # First-edit year outside the cohort range (e.g. before 2004):
            # the row belongs to no cohort, so skip it instead of crashing.
            return

        self.data['added'][cohorts_index, time_index] += int(row['len_added'])
        # 'removed' stores the negated len_removed value.
        self.data['removed'][cohorts_index, time_index] += -int(row['len_removed'])
        self.data['net'][cohorts_index, time_index] += int(row['len_added']) + int(row['len_removed'])
        self.data['edits'][cohorts_index, time_index] += int(row['add_edits']) + int(row['remove_edits'])

    def getIndex(self, y):
        '''Return the index of the cohort, given the year of the first edit.

        :raises ValueError: if ``y`` is not one of ``self.cohorts``
        '''
        return self.cohorts.index(y)

    def colorbarTicksAndLabels(self, ncolors):
        '''Return ticks and labels for the colorbar of a WikiPride visualization.

        :arg ncolors: number of colors in the colorbar
        :returns: ``(ticks, labels)``; at most 15 labels are produced
        '''
        # Too many dates are unreadable, so cap the number of labels.
        nlabels = min(ncolors + 1, 15)

        ticks = N.linspace(0, 1., nlabels)
        # Pick cohort labels spread over the whole cohort list.
        skip = [int(i) for i in N.linspace(0, len(self.cohorts) - 1, nlabels)]
        labels = [self.cohort_labels[i] for i in skip]

        return ticks, labels
class NameSpaces(Cohort):
    '''The namespaces themselves are the cohorts.

    Namespaces '0' to '5' each get their own cohort; every other namespace
    is collected in the last cohort, 'other'.
    '''

    def __init__(self):
        '''Define cohorts, labels, the namespace lookup and the SQL query.'''
        # Cohort definition.
        self.cohorts = ('0', '1', '2', '3', '4', '5', 'other')
        # Cohort labels.
        self.cohort_labels = ['%s namespace' % i for i in self.cohorts]

        # Namespace string -> row index; unknown namespaces map to 'other'
        # (see getIndex).
        self.cohort_index = {'0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5}

        # The SQL query returns edit information for each editor for each
        # year-month she has edited.
        self.sqlQuery = 'SELECT * FROM %s;' % tables.EDITOR_YEAR_MONTH_NAMESPACE

        Cohort.__init__(self)

    def initData(self):
        '''Allocate one zeroed (cohorts x time stamps) array per metric.

        ``self.time_stamps`` is not set in this class (presumably provided
        by ``Cohort`` -- verify).
        '''
        self.data['added'] = N.zeros((len(self.cohorts), len(self.time_stamps)))
        self.data['removed'] = N.zeros((len(self.cohorts), len(self.time_stamps)))
        self.data['net'] = N.zeros((len(self.cohorts), len(self.time_stamps)))
        self.data['edits'] = N.zeros((len(self.cohorts), len(self.time_stamps)))

        self.initDataDescription()

    def initDataDescription(self):
        '''Initialize the self.data_description dictionary with additional information.'''
        self.data_description['added'] = {
            'title': 'Megabytes added to namespaces (%s)' % ('excluding bots' if self.nobots else 'including bots'),
            'ylabel': 'Megabytes',
            'ytickslabel': lambda x: '%d' % (x/1e6),
        }
        self.data_description['removed'] = {
            'title': 'Megabytes removed from namespaces (%s)' % ('excluding bots' if self.nobots else 'including bots'),
            'ylabel': 'Megabytes',
            'ytickslabel': lambda x: '%d' % (x/1e6),
        }
        self.data_description['net'] = {
            'title': 'Megabytes added-removed to namespaces (%s)' % ('excluding bots' if self.nobots else 'including bots'),
            'ylabel': 'Megabytes',
            'ytickslabel': lambda x: '%d' % (x/1e6),
        }
        self.data_description['edits'] = {
            'title': 'Number of edits to namespaces (%s)' % ('excluding bots' if self.nobots else 'including bots'),
            'ylabel': 'Edits',
        }

    def processSQLrow(self, row):
        '''Accumulate one SQL result row into the data arrays.

        Rows from bots and rows whose revision month is not a known time
        stamp are skipped. A metric is only updated when every column it
        needs is not None.
        '''
        editor_id = row['user_id']
        if utils.isBot(editor_id):
            return

        year = row['rev_year']
        month = row['rev_month']
        ns = str(row['namespace'])

        ym = '%d%02d' % (year, month)
        time_index = self.time_stamps_index.get(ym, None)
        if time_index is None:
            return

        cohorts_index = self.getIndex(ns)

        len_added = row['len_added']
        len_removed = row['len_removed']

        if len_added is not None:
            self.data['added'][cohorts_index, time_index] += int(len_added)
        if len_removed is not None:
            # 'removed' stores the negated len_removed value.
            self.data['removed'][cohorts_index, time_index] += -int(len_removed)
        # Net needs both values. The original tested len_removed twice, so a
        # None len_added reached int() and raised TypeError.
        if len_added is not None and len_removed is not None:
            self.data['net'][cohorts_index, time_index] += int(len_added) + int(len_removed)
        # The edit count needs both edit columns. The original tested
        # len_removed instead of add_edits.
        if row['add_edits'] is not None and row['remove_edits'] is not None:
            self.data['edits'][cohorts_index, time_index] += int(row['add_edits']) + int(row['remove_edits'])

    def getIndex(self, ns):
        '''Return the index of the cohort for the namespace string ``ns``.

        Namespaces missing from ``self.cohort_index`` map to 6, the position
        of 'other' in ``self.cohorts``.
        '''
        return self.cohort_index.get(ns, 6)

    def colorbarTicksAndLabels(self, ncolors):
        '''Return ticks and labels for the colorbar of a WikiPride visualization.

        :arg ncolors: number of colors in the colorbar
        :returns: ``(ticks, labels)``; ``ncolors`` ticks, each shifted by
            half a color step, and all cohort labels
        '''
        nlabels = ncolors
        ticks = N.linspace(0, (1. - 1. / nlabels), nlabels) + 0.5 / nlabels
        labels = self.cohort_labels

        return ticks, labels
class NewEditors(Cohort):
    '''A single cohort holding the number of editors who started
    contributing in any given month.

    :meth:`.NewEditors.linePlots` creates a line plot.
    '''

    def __init__(self):
        '''Define the single cohort and the SQL query counting recruits.'''
        # Cohort definition; the labels are the very same list.
        self.cohorts = ['New Editors']
        self.cohort_labels = self.cohorts

        # The SQL query returns the new editor count for each year-month.
        self.sqlQuery = """SELECT first_edit_year, first_edit_month, count(*) AS recruits FROM %s GROUP BY first_edit_year, first_edit_month;""" % tables.USER_COHORT

        Cohort.__init__(self)

    def initData(self):
        '''Allocate the zeroed (cohorts x time stamps) 'editors' array.'''
        n_rows = len(self.cohorts)
        n_columns = len(self.time_stamps)
        self.data['editors'] = N.zeros((n_rows, n_columns))

        self.initDataDescription()

    def initDataDescription(self):
        '''Fill self.data_description with the title and y-axis label.'''
        description = {
            'title': 'Number of new editors (first edit)',
            'ylabel': '# Editors',
        }
        self.data_description['editors'] = description

    def processSQLrow(self, row):
        '''Add the row's recruit count to the month of the first edit.

        Rows whose first-edit month is not a known time stamp are ignored.
        '''
        key = '%d%02d' % (row['first_edit_year'], row['first_edit_month'])
        column = self.time_stamps_index.get(key, None)
        if column is not None:
            # There is only one cohort, hence row 0.
            self.data['editors'][0, column] += row['recruits']

    def getIndex(self, ns):
        '''Not needed in this cohort; always raises Exception.'''
        raise Exception("NO!")

    def colorbarTicksAndLabels(self, ncolors):
        '''Return ticks and labels for the colorbar of a WikiPride visualization.

        :arg ncolors: number of colors in the colorbar
        :returns: ``(ticks, labels)``; ``ncolors`` ticks, each shifted by
            half a color step, and all cohort labels
        '''
        last_tick = 1. - 1. / ncolors
        ticks = N.linspace(0, last_tick, ncolors) + 0.5 / ncolors
        return ticks, self.cohort_labels

    def linePlots(self, dest):
        '''Create a line plot for the number of new editors and save it to disk.

        :arg dest: str, destination directory
        '''
        logger.info('Creating line plots for New editors cohort.')

        description = self.data_description['editors']
        fig = self.addLine(self.data['editors'][0, :])
        self.saveFigure(name='line', fig=fig, dest=dest, title=description['title'], ylabel=description['ylabel'])