'''This module implements histogram cohorts, e.g. EditsHistogram.
'''
import logging
import sys

# Module-level logger shared by all cohort classes in this file.
logger = logging.getLogger('Histogram cohorts')

# numpy is needed by every cohort class below. A failed import is only
# logged here, so the module still loads; any later use of `N` will fail.
try:
    import numpy as N
except ImportError:
    logger.error('Numpy not installed')

import settings
import utils
from cohorts.base import Cohort
from data import tables
class EditsHistogram(Cohort):
    '''Cohorts based on the number of edits an editor has done in a given month.

    Implemented only for MongoDB.
    '''

    def __init__(self):
        # We don't take into consideration people who have 0 edits as the data
        # is coded sparse and we don't have this information.
        # Note that len(self.cohorts) has to be equal to the number of bins in
        # the numpy array.
        # Alternative, finer binning:
        # self.cohorts = [1,3,5,7,10,20,40,60,80,100,200,500,1000,5000,10000,'>10000 edits']

        #: Cohort definition: inclusive upper bounds of the edit-count bins,
        #: followed by the label of the open-ended last bin.
        self.cohorts = [1, 5, 10, 50, 100, 500, 1000, '>1000 edits']

        #: Cohort labels, one per entry of self.cohorts.
        self.cohort_labels = self.cohorts[:]
        self.cohort_labels[0] = '%s edit' % (self.cohort_labels[0])
        for i in range(1, len(self.cohorts) - 1):
            self.cohort_labels[i] = '%s-%s edits' % (self.cohorts[i - 1] + 1, self.cohorts[i])
        # Alternative labelling:
        # self.cohort_labels = ['<%s edits'%(e) for e in self.cohorts]

        # The base initialiser runs last, after the cohort attributes exist.
        Cohort.__init__(self)

    def getIndex(self, edits):
        '''Returns the index of the cohort (bin) that `edits` falls into.

        The last entry of self.cohorts is the string label of the open-ended
        bin, so it is left out of the numeric comparison (comparing a number
        with a string raises TypeError on Python 3). Any count larger than the
        last numeric bound is assigned to that last bin.
        '''
        for i, upper_bound in enumerate(self.cohorts[:-1]):
            if edits <= upper_bound:
                return i
        return len(self.cohorts) - 1

    def processMongoDocument(self, editor):
        '''Adds one editor document to the histogram.

        The document is read as editor['edit_count'][year][month][namespace],
        holding an edit count. Documents without 'edit_count' and editors that
        utils.isBot flags are skipped. For every month present in
        self.time_stamps_index, the edits in the namespaces listed in self.NS
        are summed and the matching histogram bin is incremented by one.
        '''
        if 'edit_count' not in editor:
            return

        if utils.isBot(editor['user_id']):
            return

        for year, edits_by_month in editor['edit_count'].items():
            for month, edits_by_namespace in edits_by_month.items():
                # extract year and month 20xxxx
                ym = '%s%02d' % (year, int(month))

                time_index = self.time_stamps_index.get(ym, None)
                if time_index is None:
                    continue

                # cohort index depends on the aggregate of the namespaces
                nedits = sum(count for ns, count in edits_by_namespace.items() if ns in self.NS)

                # Sparse representation. Only collecting >0 edits because it is
                # possible that an editor has no edits in the selected
                # namespaces, but edits in other namespaces. This also means
                # editors with zero edits in a month are never counted.
                if nedits > 0:
                    cohorts_index = self.getIndex(nedits)
                    self.data['editsHistogram'][cohorts_index, time_index] += 1

    def initData(self):
        '''Allocates the zero-filled (cohorts x time stamps) histogram array.'''
        self.data['editsHistogram'] = N.zeros((len(self.cohorts), len(self.time_stamps)))
        self.initDataDescription()

    def initDataDescription(self):
        '''Initialize the self.data_description dictionary with information used for plotting.
        '''
        self.data_description['editsHistogram'] = {
            'title': 'Histogram of the number of edits',
            'ylabel': 'Number of Editors',
        }

    def getColor(self, i):
        '''
        Returns a color based on the index of the cohort i
        '''
        return self.colors[i]

    def colorbarTicksAndLabels(self, ncolors):
        '''Returns ticks and labels for the colorbar of a WikiPride visualization

        `ncolors` ticks are placed at the centres of `ncolors` equal segments
        of [0, 1]; the labels are picked from self.cohort_labels at evenly
        spaced (truncated) indices.
        '''
        nlabels = ncolors
        ticks = N.linspace(0, (1. - 1. / nlabels), nlabels) + 0.5 / nlabels
        skip = [int(i) for i in N.linspace(0, len(self.cohorts) - 1, nlabels)]
        labels = [self.cohort_labels[i] for i in skip]
        return ticks, labels
class EditorActivity(Cohort):
    '''Cohorts based on the number of edits an editor has done in a given month.

    It uses a table where the values are aggregated for all namespaces.
    '''

    def __init__(self):
        # We don't take into consideration people who have 0 edits as the data
        # is coded sparse and we don't have this information.

        #: Cohort definition: inclusive upper bounds of the edit-count bins,
        #: followed by the label of the open-ended last bin.
        self.cohorts = [1, 5, 50, 100, 500, 1000, '>1000 edits']

        #: Cohort labels, one per entry of self.cohorts.
        self.cohort_labels = self.cohorts[:]
        self.cohort_labels[0] = '%s edit' % (self.cohort_labels[0])
        for i in range(1, len(self.cohorts) - 1):
            self.cohort_labels[i] = '%s-%s edits' % (self.cohorts[i - 1] + 1, self.cohorts[i])
        # Alternative labelling:
        # self.cohort_labels = ['<%s edits'%(e) for e in self.cohorts]

        #: The SQL query returns edit information for each editor for each ym
        #: she has edited.
        self.sqlQuery = 'SELECT * FROM %s;' % tables.EDITOR_YEAR_MONTH

        # The base initialiser runs last, after the cohort attributes exist.
        Cohort.__init__(self)

    def getIndex(self, edits):
        '''Returns the index of the cohort (bin) that `edits` falls into.

        The last entry of self.cohorts is the string label of the open-ended
        bin, so it is left out of the numeric comparison (comparing a number
        with a string raises TypeError on Python 3). Any count larger than the
        last numeric bound is assigned to that last bin.
        '''
        for i, upper_bound in enumerate(self.cohorts[:-1]):
            if edits <= upper_bound:
                return i
        return len(self.cohorts) - 1

    def initData(self):
        '''Allocates one zero-filled (cohorts x time stamps) array per metric.'''
        shape = (len(self.cohorts), len(self.time_stamps))
        for key in ('added', 'removed', 'net', 'edits', 'editors'):
            self.data[key] = N.zeros(shape)
        self.initDataDescription()

    def initDataDescription(self):
        '''Initialize the self.data_description dictionary with information used for plotting.
        '''
        bots = 'no bots' if self.nobots else 'including bots'

        self.data_description['added'] = {
            'title': 'Megabytes added by editor activity ( %s, namespaces:All)' % (bots),
            'ylabel': 'Megabytes',
            'ytickslabel': lambda x: '%d' % (x / 1e6),
        }
        self.data_description['removed'] = {
            'title': 'Megabytes removed by editor activity ( %s, namespaces:All)' % (bots),
            'ylabel': 'Megabytes',
            'ytickslabel': lambda x: '%d' % (x / 1e6),
        }
        self.data_description['net'] = {
            'title': 'Megabytes Added-Removed by editor activity ( %s, namespaces:All)' % (bots),
            'ylabel': 'Megabytes',
            'ytickslabel': lambda x: '%d' % (x / 1e6),
        }
        self.data_description['edits'] = {
            'title': 'Number of edits by editor activity ( %s, namespaces:All)' % (bots),
            'ylabel': 'Edits',
        }
        self.data_description['editors'] = {
            'title': 'Active editor histogram (%s, namespaces:All)' % (bots),
            'ylabel': 'Number of Editors',
        }

    def processSQLrow(self, row):
        '''Adds one (editor, year, month) row to the per-cohort aggregates.

        Rows of editors flagged by utils.isBot and rows whose month is not in
        self.time_stamps_index are skipped. The cohort is chosen from the sum
        of the add, remove and noop edit counts of the row; None values are
        treated as missing and ignored.
        '''
        if utils.isBot(row['user_id']):
            return

        ym = '%d%02d' % (row['rev_year'], row['rev_month'])
        time_index = self.time_stamps_index.get(ym, None)
        if time_index is None:
            return

        edits = 0
        for key in ('add_edits', 'remove_edits', 'noop_edits'):
            if row[key] is not None:
                edits += int(row[key])

        cohorts_index = self.getIndex(edits)

        self.data['editors'][cohorts_index, time_index] += 1
        if row['len_added'] is not None:
            self.data['added'][cohorts_index, time_index] += int(row['len_added'])
        if row['len_removed'] is not None:
            # The sign of len_removed is flipped before it is accumulated.
            self.data['removed'][cohorts_index, time_index] += -int(row['len_removed'])
        if row['len_added'] is not None and row['len_removed'] is not None:
            self.data['net'][cohorts_index, time_index] += int(row['len_added']) + int(row['len_removed'])
        self.data['edits'][cohorts_index, time_index] += edits

    def getColor(self, i):
        '''
        Returns a color based on the index of the cohort i
        '''
        return self.colors[i]

    def colorbarTicksAndLabels(self, ncolors):
        '''Returns ticks and labels for the colorbar of a WikiPride visualization

        `ncolors` ticks are placed at the centres of `ncolors` equal segments
        of [0, 1]; the labels are picked from self.cohort_labels at evenly
        spaced (truncated) indices.
        '''
        nlabels = ncolors
        ticks = N.linspace(0, (1. - 1. / nlabels), nlabels) + 0.5 / nlabels
        skip = [int(i) for i in N.linspace(0, len(self.cohorts) - 1, nlabels)]
        labels = [self.cohort_labels[i] for i in skip]
        return ticks, labels

    def linePlots(self, dest):
        '''Graphs for editor activity histogram cohort include

            * Number of editors by activity
            * Number of edits by activity
            * Bytes added by activity
            * Bytes added per editor
            * Bytes added per edit
            * Edits per editor

        Each graph has one line per cohort and is written through
        self.saveFigure with `dest` as destination. The arrays stored in
        self.data are not modified.
        '''
        logger.info('Creating line plots for %s', self)

        def graph(d, title, ylabel, ylog=False, loc=None):
            # One line per row (cohort) of d, then save under the title.
            fig = None
            for i in range(d.shape[0]):
                fig = self.addLine(data=d[i, :], fig=fig, label=self.cohort_labels[i])
            self.saveFigure(name=title, fig=fig, dest=dest, title=title, ylabel=ylabel, ylog=ylog, legendpos=loc)

        added = self.data['added']
        edits = self.data['edits']
        editors = self.data['editors']

        graph(d=editors, title="Number of editors by activity", ylabel="# of editors", loc=2)
        graph(d=edits, title="Number of edits by activity", ylabel="# of edits", loc=2)
        graph(d=added, title="Bytes added by activity", ylabel="Bytes", loc=2)

        # Zero cells are replaced by 1 to avoid dividing by zero. This is done
        # on copies: replacing in place would silently change self.data.
        editors_nz = N.where(editors == 0, 1, editors)
        edits_nz = N.where(edits == 0, 1, edits)

        graph(d=added / editors_nz, title="Bytes added per editor", ylabel="Bytes (log scale)", ylog=True, loc=2)
        # The numerator is the zero-replaced copy as well, so cells without
        # edits plot as 1 rather than 0 on the log axis.
        graph(d=edits_nz / editors_nz, title="Edits per editor", ylabel="# of edits (log scale)", ylog=True, loc=1)
        graph(d=added / edits_nz, title="Bytes added per edit", ylabel="Bytes", loc=1)

        # The first year of one-year cohorts in one plot (x-axis is age, not time)
        # NOTE(review): the figure built below is never passed to saveFigure,
        # and the slices index the cohort axis with offsets derived from the
        # time axis. This looks carried over from an age-based cohort class;
        # confirm whether it should be saved, rewritten or removed.
        months_per_year = 12
        fig = None
        # Floor division keeps the range bounds integral on Python 3 as well.
        for i in range(0, (added.shape[1] // months_per_year) * months_per_year, months_per_year):
            last = i + months_per_year - 1
            e = edits_nz[i:last, :].sum(axis=0)
            e[e == 0] = 1
            data = added[i:last, :].sum(axis=0) / e
            fig = self.addLine(data=data, fig=fig, label='%s-%s months active' % (i, last))
class NewEditorActivity(Cohort):
    '''Cohorts based on the number of edits a new editor has done in the first
    `period` months following the month of their first edit.

    Results are stored in the column of the editor's first-edit month.
    NOTE(review): unlike EditorActivity, no sqlQuery is set in this class;
    confirm where the query providing first_edit_year/first_edit_month is defined.
    '''

    def __init__(self, period=3):
        # We don't take into consideration people who have 0 edits as the data
        # is coded sparse and we don't have this information.
        # Note that len(self.cohorts) has to be equal to the number of bins in
        # the numpy array.
        # Alternative, finer binning:
        # self.cohorts = [1,3,5,7,10,20,40,60,80,100,200,500,1000,5000,10000,'>10000 edits']

        #: Cohort definition: inclusive upper bounds of the edit-count bins,
        #: followed by the label of the open-ended last bin.
        self.cohorts = [1, 5, 10, 50, 100, 500, 1000, '>1000 edits']

        #: Cohort labels, one per entry of self.cohorts.
        self.cohort_labels = self.cohorts[:]
        self.cohort_labels[0] = '%s edit' % (self.cohort_labels[0])
        for i in range(1, len(self.cohorts) - 1):
            self.cohort_labels[i] = '%s-%s edits' % (self.cohorts[i - 1] + 1, self.cohorts[i])
        # Alternative labelling:
        # self.cohort_labels = ['<%s edits'%(e) for e in self.cohorts]

        #: The number of months an editor is considered new
        self.period = period

        #: The user_id of the previously encountered editor as we iterate
        #: through the table
        self.old_user_id = None

        #: The ym at the end of the `period` months after the first edit of an
        #: editor
        self.lastym = None

        # ym of the current editor's first edit and its column index in the
        # data arrays (None when that month is not in self.time_stamps_index).
        self.firstedit = None
        self.fe_index = None

        # Running totals for the editor currently being aggregated.
        self.edits = 0
        self.added = 0
        self.removed = 0
        self.net = 0

        # The base initialiser runs last, after the cohort attributes exist.
        Cohort.__init__(self)

    def getIndex(self, edits):
        '''Returns the index of the cohort (bin) that `edits` falls into.

        The last entry of self.cohorts is the string label of the open-ended
        bin, so it is left out of the numeric comparison (comparing a number
        with a string raises TypeError on Python 3). Any count larger than the
        last numeric bound is assigned to that last bin.
        '''
        for i, upper_bound in enumerate(self.cohorts[:-1]):
            if edits <= upper_bound:
                return i
        return len(self.cohorts) - 1

    def _flushEditor(self):
        '''Adds the totals aggregated for the current editor to self.data.

        Does nothing when no editor has been seen yet, or when the editor's
        first-edit month has no column (fe_index is None). Indexing a numpy
        array with None would otherwise address the whole cohort row.
        '''
        if self.old_user_id is None or self.fe_index is None:
            return

        # getting bin of the number of edits the editor has done
        cohorts_index = self.getIndex(self.edits)

        self.data['editors'][cohorts_index, self.fe_index] += 1
        # Accumulate: several editors can share the same bin and first-edit
        # month, so plain assignment would discard the earlier ones.
        self.data['added'][cohorts_index, self.fe_index] += self.added
        self.data['removed'][cohorts_index, self.fe_index] += self.removed
        self.data['net'][cohorts_index, self.fe_index] += self.net
        self.data['edits'][cohorts_index, self.fe_index] += self.edits

    def processSQLrow(self, row):
        '''Aggregates one (editor, year, month) row for the current editor.

        A change of user_id marks the start of a new editor: the totals of the
        previous editor are written to self.data and the running totals are
        reset. Rows of editors flagged by utils.isBot are skipped, as are rows
        later than self.lastym and rows of editors whose first-edit month is
        not in self.time_stamps_index.

        This assumes all rows of one editor arrive consecutively - TODO confirm
        against the query ordering.
        NOTE(review): the totals of the last editor in the stream are only
        written if _flushEditor is called after the final row; nothing in this
        class does so.
        '''
        editor_id = row['user_id']
        if utils.isBot(editor_id):
            return

        if editor_id != self.old_user_id:
            # we just finished aggregating information for an editor
            self._flushEditor()

            # initializing info for current editor
            self.old_user_id = editor_id
            self.edits = 0
            self.added = 0
            self.removed = 0
            self.net = 0

            year = row['first_edit_year']
            month = row['first_edit_month']
            self.firstedit = '%d%02d' % (year, month)
            self.fe_index = self.time_stamps_index.get(self.firstedit, None)
            if self.fe_index is None:
                return

            # Last ym of interest: `period` months after the first-edit month,
            # computed on a zero-based month count so the month stays within
            # 01-12 (the year rolls over after December).
            months = (month - 1) + self.period
            self.lastym = '%d%02d' % (year + months // 12, months % 12 + 1)

        if self.fe_index is None:
            return

        ym = '%d%02d' % (row['rev_year'], row['rev_month'])
        # Both values are zero-padded 'YYYYMM' strings, so string comparison
        # orders them chronologically.
        if ym > self.lastym:
            return

        for key in ('add_edits', 'remove_edits', 'noop_edits'):
            if row[key] is not None:
                self.edits += int(row[key])

        if row['len_added'] is not None:
            self.added += int(row['len_added'])
        if row['len_removed'] is not None:
            # The sign of len_removed is flipped before it is accumulated.
            self.removed += -int(row['len_removed'])
        if row['len_added'] is not None and row['len_removed'] is not None:
            self.net += int(row['len_added']) + int(row['len_removed'])

    def initData(self):
        '''Allocates one zero-filled (cohorts x time stamps) array per metric.'''
        shape = (len(self.cohorts), len(self.time_stamps))
        for key in ('added', 'removed', 'net', 'edits', 'editors'):
            self.data[key] = N.zeros(shape)
        self.initDataDescription()

    def initDataDescription(self):
        '''Initialize the self.data_description dictionary with information used for plotting.
        '''
        bots = 'no bots' if self.nobots else 'including bots'

        self.data_description['added'] = {
            'title': 'Megabytes added by new editor activity ( <%s months, %s, namespaces:All)' % (self.period, bots),
            'ylabel': 'Megabytes',
            'ytickslabel': lambda x: '%d' % (x / 1e6),
        }
        self.data_description['removed'] = {
            'title': 'Megabytes removed by new editor activity ( <%s months, %s, namespaces:All)' % (self.period, bots),
            'ylabel': 'Megabytes',
            'ytickslabel': lambda x: '%d' % (x / 1e6),
        }
        self.data_description['net'] = {
            'title': 'Megabytes Added-Removed by new editor activity ( <%s months, %s, namespaces:All)' % (self.period, bots),
            'ylabel': 'Megabytes',
            'ytickslabel': lambda x: '%d' % (x / 1e6),
        }
        self.data_description['edits'] = {
            'title': 'Number of edits by new editor activity ( <%s months, %s, namespaces:All)' % (self.period, bots),
            'ylabel': 'Edits',
        }
        self.data_description['editors'] = {
            'title': 'Edits Histogram for the first %s month of an edit activity (%s, namespaces:All)' % (self.period, bots),
            'ylabel': 'Number of Editors',
        }

    def getColor(self, i):
        '''
        Returns a color based on the index of the cohort i
        '''
        return self.colors[i]

    def colorbarTicksAndLabels(self, ncolors):
        '''Returns ticks and labels for the colorbar of a WikiPride visualization

        `ncolors` ticks are placed at the centres of `ncolors` equal segments
        of [0, 1]; the labels are picked from self.cohort_labels at evenly
        spaced (truncated) indices.
        '''
        nlabels = ncolors
        ticks = N.linspace(0, (1. - 1. / nlabels), nlabels) + 0.5 / nlabels
        skip = [int(i) for i in N.linspace(0, len(self.cohorts) - 1, nlabels)]
        labels = [self.cohort_labels[i] for i in skip]
        return ticks, labels