'''For simple cohorts :)
'''
import sys,logging
logger = logging.getLogger('Age cohorts')
try:
import numpy as N
except:
logger.error('Numpy not installed')
import settings
import utils
from cohorts.base import Cohort
from data import tables
[docs]class OneYearCohort(Cohort):
'''A cohort that is comprised of active editors that started editing in a given year.
'''
def __init__(self,year,activation=5,overall=False):
self.year = year
'''The year the cohort started.
'''
self.activation = activation
'''Minimum number of edits per month to be included in the cohort
'''
self.overall = overall
ts,ts_i = utils.create_time_stamps_month(fromym='%s01'%self.year,toym='201012')
self.time_stamps = ts
self.time_stamps_index = ts_i
'''Only take time_stamps starting with self.year
'''
self.cohorts = [self.year,'others']
'''Cohort definition
'''
self.cohort_labels = ['%s cohort'% i for i in self.cohorts]
'''Cohort labels
'''
if self.overall:
self.sqlQuery = "SELECT * FROM fabian WHERE user_id IN (SELECT user_id FROM fabian WHERE first_edit_year=%s GROUP BY user_id HAVING SUM(add_edits)>%s);"%(self.year,self.activation)
else:
self.sqlQuery = 'SELECT * FROM fabian WHERE add_edits > %s AND first_edit_year=%s'%(self.activation,self.year)
Cohort.__init__(self)
def initData(self):
self.data['added'] = N.zeros((len(self.cohorts), len(self.time_stamps)))
self.data['removed'] = N.zeros((len(self.cohorts), len(self.time_stamps)))
self.data['net'] = N.zeros((len(self.cohorts), len(self.time_stamps)))
self.data['edits'] = N.zeros((len(self.cohorts), len(self.time_stamps)))
# self.data['size'] = N.zeros((len(self.cohorts), len(self.time_stamps)))
self.initDataDescription()
[docs] def initDataDescription(self):
'''Initialize the self.data_description dictionary with additional information
'''
self.data_description['added'] = { 'title' : '%s Cohort, >%s edits/month - Megabytes added (%s, namespaces:%s)'%(self.year,self.activation,'no bots' if self.nobots else 'including bots', ','.join(self.NS) if len(self.NS)<16 else 'all'), \
'ylabel': 'Megabytes',\
'ytickslabel' : lambda x : '%d'%(x/1e6) }
self.data_description['removed'] = { 'title' : '%s Cohort, >%s edits/month - Megabytes removed (%s, namespaces:%s)'%(self.year,self.activation,'no bots' if self.nobots else 'including bots', ','.join(self.NS) if len(self.NS)<16 else 'all'), \
'ylabel': 'Megabytes',\
'ytickslabel' : lambda x : '%d'%(x/1e6) }
self.data_description['net'] = { 'title' : '%s Cohort, >%s edits/month - Megabytes Added-Removed (%s, namespaces:%s)'%(self.year,self.activation,'no bots' if self.nobots else 'including bots', ','.join(self.NS) if len(self.NS)<16 else 'all'), \
'ylabel': 'Megabytes',\
'ytickslabel' : lambda x : '%d'%(x/1e6) }
self.data_description['edits'] = { 'title' : '%s Cohort, >%s edits/month - Number of edits (%s, namespaces:%s)'%(self.year,self.activation,'no bots' if self.nobots else 'including bots', ','.join(self.NS) if len(self.NS)<16 else 'all'), \
'ylabel': 'Edits' }
def processSQLrow(self,row):
editor_id = row['user_id']
if utils.isBot(editor_id):
return
year = row['rev_year']
month = row['rev_month']
ns = str(row['namespace'])
ym = '%d%02d'%(year,month)
time_index = self.time_stamps_index.get(ym,None)
if time_index is None:
return
firstedit = '%d%02d'%(row['first_edit_year'],row['first_edit_month'])
fe_index = self.time_stamps_index.get(firstedit,None)
if fe_index is None:
return
cohorts_index = self.getIndex(fe_index)
if ns in self.NS:
self.data['added'][cohorts_index,time_index] += int(row['len_added'])
self.data['removed'][cohorts_index,time_index] += -int(row['len_removed'])
self.data['net'][cohorts_index,time_index] += int(row['len_added']) + int(row['len_removed'])
self.data['edits'][cohorts_index,time_index] += int(row['add_edits'])+int(row['remove_edits'])
[docs] def getIndex(self, fe):
'''
Returns the index of the cohort, which is identical to the time index of the first edit
'''
return 0
[docs]class ProjectSpaceCohorts(Cohort):
'''A cohort that is comprised of active editors that started editing in a given year. Only the contributions to the Wikipedia namespaces 4&5 are considered.
'''
def __init__(self,activation=5):
self.activation = activation
self.NS = ( '4', '5' )
ts,ts_i = utils.create_time_stamps_month(fromym='200401',toym='201012')
self.time_stamps = ts
self.time_stamps_index = ts_i
'''Only take time_stamps starting with self.year
'''
self.cohorts = range(2004,2011)
'''Cohort definition
'''
self.cohort_labels = ['%s cohort'% i for i in self.cohorts]
'''Cohort labels
'''
# self.sqlQuery = 'SELECT * FROM fabian WHERE namespace IN (4,5) AND add_edits > %s AND first_edit_year in (%s)'%(self.activation,','.join([str(c) for c in self.cohorts]))
Cohort.__init__(self)
def initData(self):
self.data['added'] = N.zeros((len(self.cohorts), len(self.time_stamps)))
self.data['removed'] = N.zeros((len(self.cohorts), len(self.time_stamps)))
self.data['net'] = N.zeros((len(self.cohorts), len(self.time_stamps)))
self.data['edits'] = N.zeros((len(self.cohorts), len(self.time_stamps)))
# self.data['size'] = N.zeros((len(self.cohorts), len(self.time_stamps)))
self.initDataDescription()
[docs] def initDataDescription(self):
'''Initialize the self.data_description dictionary with additional information
'''
self.data_description['added'] = { 'title' : 'Megabytes added to Project namespaces, >%s edits/month'%(self.activation), \
'ylabel': 'Megabytes',\
'ytickslabel' : lambda x : '%d'%(x/1e6) }
# self.data_description['removed'] = { 'title' : '%s Cohort, >%s edits/month - Megabytes removed (%s, namespaces:%s)'%(self.year,self.activation,'no bots' if self.nobots else 'including bots', ','.join(self.NS) if len(self.NS)<16 else 'all'), \
# 'ylabel': 'Megabytes',\
# 'ytickslabel' : lambda x : '%d'%(x/1e6) }
# self.data_description['net'] = { 'title' : '%s Cohort, >%s edits/month - Megabytes Added-Removed (%s, namespaces:%s)'%(self.year,self.activation,'no bots' if self.nobots else 'including bots', ','.join(self.NS) if len(self.NS)<16 else 'all'), \
# 'ylabel': 'Megabytes',\
# 'ytickslabel' : lambda x : '%d'%(x/1e6) }
# self.data_description['edits'] = { 'title' : '%s Cohort, >%s edits/month - Number of edits (%s, namespaces:%s)'%(self.year,self.activation,'no bots' if self.nobots else 'including bots', ','.join(self.NS) if len(self.NS)<16 else 'all'), \
# 'ylabel': 'Edits' }
def processSQLrow(self,row):
editor_id = row['user_id']
if utils.isBot(editor_id):
return
year = row['rev_year']
month = row['rev_month']
ns = str(row['namespace'])
ym = '%d%02d'%(year,month)
time_index = self.time_stamps_index.get(ym,None)
if time_index is None:
return
cohorts_index = self.getIndex(row['first_edit_year'])
if ns in self.NS:
self.data['added'][cohorts_index,time_index] += int(row['len_added'])
self.data['removed'][cohorts_index,time_index] += -int(row['len_removed'])
self.data['net'][cohorts_index,time_index] += int(row['len_added']) + int(row['len_removed'])
self.data['edits'][cohorts_index,time_index] += int(row['add_edits'])+int(row['remove_edits'])
[docs] def getIndex(self, y):
'''
Returns the index of the cohort, given the year of the first edit
'''
return self.cohorts.index(y)
[docs] def colorbarTicksAndLabels(self,ncolors):
'''Returns ticks and labels for the colorbar of a WikiPride visualization
'''
nlabels = ncolors+1
# too many dates are unreadable
if nlabels > 15:
nlabels = 15
ticks = N.linspace(0, 1., nlabels) # +1./(nlabels-1)*1/2
skip = [ int(i) for i in N.linspace(0,len(self.cohorts)-1,nlabels) ]
labels = [self.cohort_labels[i] for i in skip]
return ticks,labels
[docs]class NameSpaces(Cohort):
'''The namespaces themselves are cohorts
'''
def __init__(self):
self.cohorts = ('0', '1', '2', '3', '4', '5','other')
'''Cohort definition
'''
self.cohort_labels = ['%s namespace'% i for i in self.cohorts]
'''Cohort labels
'''
self.cohort_index = {'0':0, '1':1, '2':2, '3':3, '4':4, '5':5}
self.sqlQuery = 'SELECT * FROM %s;'%tables.EDITOR_YEAR_MONTH_NAMESPACE
'''The SQL query returns edit information for each editor for each ym she has edited.'''
Cohort.__init__(self)
def initData(self):
self.data['added'] = N.zeros((len(self.cohorts), len(self.time_stamps)))
self.data['removed'] = N.zeros((len(self.cohorts), len(self.time_stamps)))
self.data['net'] = N.zeros((len(self.cohorts), len(self.time_stamps)))
self.data['edits'] = N.zeros((len(self.cohorts), len(self.time_stamps)))
# self.data['size'] = N.zeros((len(self.cohorts), len(self.time_stamps)))
self.initDataDescription()
[docs] def initDataDescription(self):
'''Initialize the self.data_description dictionary with additional information
'''
self.data_description['added'] = { 'title' : 'Megabytes added to namespaces (%s)'%('excluding bots' if self.nobots else 'including bots'), \
'ylabel': 'Megabytes',\
'ytickslabel' : lambda x : '%d'%(x/1e6) }
self.data_description['removed'] = { 'title' : 'Megabytes removed from namespaces (%s)'%('excluding bots' if self.nobots else 'including bots'), \
'ylabel': 'Megabytes',\
'ytickslabel' : lambda x : '%d'%(x/1e6) }
self.data_description['net'] = { 'title' : 'Megabytes added-removed to namespaces (%s)'%('excluding bots' if self.nobots else 'including bots'), \
'ylabel': 'Megabytes',\
'ytickslabel' : lambda x : '%d'%(x/1e6) }
self.data_description['edits'] = { 'title' : 'Number of edits to namespaces (%s)'%('excluding bots' if self.nobots else 'including bots'), \
'ylabel': 'Edits' }
def processSQLrow(self,row):
editor_id = row['user_id']
if utils.isBot(editor_id):
return
year = row['rev_year']
month = row['rev_month']
ns = str(row['namespace'])
ym = '%d%02d'%(year,month)
time_index = self.time_stamps_index.get(ym,None)
if time_index is None:
return
cohorts_index = self.getIndex(ns)
if row['len_added'] is not None:
self.data['added'][cohorts_index,time_index] += int(row['len_added'])
if row['len_removed'] is not None:
self.data['removed'][cohorts_index,time_index] += -int(row['len_removed'])
if row['len_removed'] is not None and row['len_removed'] is not None:
self.data['net'][cohorts_index,time_index] += int(row['len_added']) + int(row['len_removed'])
if row['len_removed'] is not None and row['remove_edits'] is not None:
self.data['edits'][cohorts_index,time_index] += int(row['add_edits'])+int(row['remove_edits'])
[docs] def getIndex(self, ns):
'''
Returns the index of the cohort, given the year of the first edit
'''
return self.cohort_index.get(ns,6)
[docs] def colorbarTicksAndLabels(self,ncolors):
'''Returns ticks and labels for the colorbar of a WikiPride visualization
'''
nlabels = ncolors
ticks = N.linspace(0, (1.-1./nlabels), nlabels) +0.5/nlabels
# skip = [ int(i) for i in N.linspace(0,len(self.cohorts)-1,nlabels+1) ]
# labels = [self.cohort_labels[i] for i in skip]
labels = self.cohort_labels
return ticks,labels
[docs]class NewEditors(Cohort):
'''There is just one cohort, which contains the number of of editors who started contributing in any given month.
:meth:`.NewEditors.linePlot` creates a line plot.
'''
def __init__(self):
self.cohorts = ['New Editors']
'''Cohort definition
'''
self.cohort_labels = self.cohorts
'''Cohort labels
'''
self.sqlQuery = """SELECT
first_edit_year,
first_edit_month,
count(*) AS recruits
FROM
%s
GROUP BY
first_edit_year,
first_edit_month;"""%tables.USER_COHORT
'''The SQL query returns the new editor count for each ym.'''
Cohort.__init__(self)
def initData(self):
self.data['editors'] = N.zeros((len(self.cohorts), len(self.time_stamps)))
# self.data['size'] = N.zeros((len(self.cohorts), len(self.time_stamps)))
self.initDataDescription()
[docs] def initDataDescription(self):
'''Initialize the self.data_description dictionary with additional information
'''
self.data_description['editors'] = { 'title' : 'Number of new editors (first edit)', \
'ylabel': '# Editors' }
def processSQLrow(self,row):
firstedit = '%d%02d'%(row['first_edit_year'],row['first_edit_month'])
fe_index = self.time_stamps_index.get(firstedit,None)
if fe_index is None:
return
# there is only one cohort
cohorts_index = 0
self.data['editors'][cohorts_index,fe_index] += row['recruits']
[docs] def getIndex(self, ns):
'''
Not needed in this cohort!
'''
raise Exception("NO!")
[docs] def colorbarTicksAndLabels(self,ncolors):
'''Returns ticks and labels for the colorbar of a WikiPride visualization
'''
nlabels = ncolors
ticks = N.linspace(0, (1.-1./nlabels), nlabels) +0.5/nlabels
# skip = [ int(i) for i in N.linspace(0,len(self.cohorts)-1,nlabels+1) ]
# labels = [self.cohort_labels[i] for i in skip]
labels = self.cohort_labels
return ticks,labels
[docs] def linePlots(self,dest):
'''Creates a line plot for the number of new editors and saves it to disk.
:arg dest: str, destination directory
'''
logger.info('Creating line plots for New editors cohort.')
fig = self.addLine(self.data['editors'][0,:])
self.saveFigure(name='line', fig=fig, dest=dest, title=self.data_description['editors']['title'],ylabel=self.data_description['editors']['ylabel'])