analytics: Add class DependentCountStat and stat realm_active_humans::day.

This commit is contained in:
Rishi Gupta 2017-04-04 22:51:55 -07:00 committed by Tim Abbott
parent 62de1cf898
commit 49bd330304
2 changed files with 197 additions and 10 deletions

View File

@ -4,15 +4,16 @@ from django.db.models import F
from django.utils import timezone
from analytics.models import InstallationCount, RealmCount, \
UserCount, StreamCount, BaseCount, FillState, Anomaly, installation_epoch
UserCount, StreamCount, BaseCount, FillState, Anomaly, installation_epoch, \
last_successful_fill
from zerver.models import Realm, UserProfile, Message, Stream, \
UserActivityInterval, RealmAuditLog, models
from zerver.lib.timestamp import floor_to_day, floor_to_hour, ceiling_to_day, \
ceiling_to_hour
from typing import Any, Callable, Dict, Optional, Text, Tuple, Type, Union
from typing import Any, Callable, Dict, List, Optional, Text, Tuple, Type, Union
from collections import defaultdict
from collections import defaultdict, OrderedDict
from datetime import timedelta, datetime
import logging
import time
@ -64,6 +65,12 @@ class LoggingCountStat(CountStat):
# type: (str, Type[BaseCount], str) -> None
CountStat.__init__(self, property, DataCollector(output_table, None), frequency)
class DependentCountStat(CountStat):
    """A CountStat whose rows are computed from the rows of other stats.

    `dependencies` holds the property names of the stats this one reads
    from. process_count_stat consults last_successful_fill for each of
    them: it never fills this stat past the earliest of those times, and
    it logs a warning and returns without filling if any dependency has
    no successful fill yet.
    """

    def __init__(self, property, data_collector, frequency, interval=None, dependencies=None):
        # type: (str, DataCollector, str, Optional[timedelta], Optional[List[str]]) -> None
        CountStat.__init__(self, property, data_collector, frequency, interval=interval)
        # Always store a fresh list. A mutable `[]` default would be shared
        # by every instance constructed without the argument, and keeping a
        # reference to the caller's list would let later mutations of it
        # silently change this stat's dependencies.
        self.dependencies = [] if dependencies is None else list(dependencies)  # type: List[str]
class DataCollector(object):
def __init__(self, output_table, pull_function):
# type: (Type[BaseCount], Optional[Callable[[str, datetime, datetime], int]]) -> None
@ -92,6 +99,15 @@ def process_count_stat(stat, fill_to_time):
else:
raise AssertionError("Unknown value for FillState.state: %s." % (fill_state.state,))
if isinstance(stat, DependentCountStat):
for dependency in stat.dependencies:
dependency_fill_time = last_successful_fill(dependency)
if dependency_fill_time is None:
logger.warning("DependentCountStat %s run before dependency %s." %
(stat.property, dependency))
return
fill_to_time = min(fill_to_time, dependency_fill_time)
currently_filled = currently_filled + timedelta(hours = 1)
while currently_filled <= fill_to_time:
logger.info("START %s %s" % (stat.property, currently_filled))
@ -229,8 +245,8 @@ def do_pull_by_sql_query(property, start_time, end_time, query, group_by):
# We do string replacement here because cursor.execute will reject a
# group_by_clause given as a param.
# We pass in the datetimes as params so that we don't have to think about
# how to convert python datetimes to SQL datetimes.
# We pass in the datetimes as params to cursor.execute so that we don't have to
# think about how to convert python datetimes to SQL datetimes.
query_ = query % {'property': property, 'subgroup': subgroup,
'group_by_clause': group_by_clause}
cursor = connection.cursor()
@ -407,6 +423,31 @@ check_useractivityinterval_by_user_query = """
GROUP BY zerver_userprofile.id %(group_by_clause)s
"""
# Query behind the 'realm_active_humans::day' stat. For each realm it inserts
# one analytics_realmcount row whose value is the number of users that have,
# at the same end_time, both
#   * an 'active_users_audit:is_bot:day' row with subgroup 'false' (a non-bot
#     that the audit log considers active), and
#   * a '15day_actives::day' row
# in analytics_usercount. Because the two subqueries are combined with an
# inner JOIN and then grouped, a realm with no matching users gets no row at
# all rather than a row with value 0.
# '%(property)s' is filled in by Python %-formatting, which also collapses the
# doubled '%%(time_end)s' to '%(time_end)s'; presumably that remaining named
# placeholder is then bound by cursor.execute -- confirm against
# do_pull_by_sql_query.
count_realm_active_humans_query = """
INSERT INTO analytics_realmcount
(realm_id, value, property, subgroup, end_time)
SELECT
usercount1.realm_id, count(*), '%(property)s', NULL, %%(time_end)s
FROM (
SELECT realm_id, user_id
FROM analytics_usercount
WHERE
property = 'active_users_audit:is_bot:day' AND
subgroup = 'false' AND
end_time = %%(time_end)s
) usercount1
JOIN (
SELECT realm_id, user_id
FROM analytics_usercount
WHERE
property = '15day_actives::day' AND
end_time = %%(time_end)s
) usercount2
ON
usercount1.user_id = usercount2.user_id
GROUP BY usercount1.realm_id
"""
# Currently unused and untested
count_stream_by_realm_query = """
INSERT INTO analytics_realmcount
@ -450,6 +491,7 @@ count_stats_ = [
# latter stat was introduced.
# 'active_users_audit:is_bot:day' is the canonical record of which users were
# active on which days (in the UserProfile.is_active sense).
# Important that this stay a daily stat, so that 'realm_active_humans::day' works as expected.
CountStat('active_users_audit:is_bot:day',
sql_data_collector(UserCount, check_realmauditlog_by_user_query, (UserProfile, 'is_bot')),
CountStat.DAY),
@ -460,7 +502,13 @@ count_stats_ = [
CountStat('15day_actives::day',
sql_data_collector(UserCount, check_useractivityinterval_by_user_query, None),
CountStat.DAY, interval=timedelta(days=15)-timedelta(minutes=15)),
CountStat('minutes_active::day', DataCollector(UserCount, do_pull_minutes_active), CountStat.DAY)
CountStat('minutes_active::day', DataCollector(UserCount, do_pull_minutes_active), CountStat.DAY),
# Canonical account of the number of active humans in a realm on each day.
DependentCountStat('realm_active_humans::day',
sql_data_collector(RealmCount, count_realm_active_humans_query, None),
CountStat.DAY,
dependencies=['active_users_audit:is_bot:day', '15day_actives::day'])
]
COUNT_STATS = {stat.property: stat for stat in count_stats_}
COUNT_STATS = OrderedDict([(stat.property, stat) for stat in count_stats_])

View File

@ -9,17 +9,19 @@ from django.utils import timezone
from analytics.lib.counts import CountStat, COUNT_STATS, process_count_stat, \
do_fill_count_stat_at_hour, do_increment_logging_stat, DataCollector, \
sql_data_collector, LoggingCountStat, do_aggregate_to_summary_table, \
do_drop_all_analytics_tables
do_drop_all_analytics_tables, DependentCountStat
from analytics.models import BaseCount, InstallationCount, RealmCount, \
UserCount, StreamCount, FillState, Anomaly, installation_epoch
UserCount, StreamCount, FillState, Anomaly, installation_epoch, \
last_successful_fill
from zerver.lib.actions import do_create_user, do_deactivate_user, \
do_activate_user, do_reactivate_user
do_activate_user, do_reactivate_user, update_user_activity_interval
from zerver.lib.timestamp import floor_to_day
from zerver.models import Realm, UserProfile, Message, Stream, Recipient, \
Huddle, Client, UserActivityInterval, RealmAuditLog, \
get_user_profile_by_email, get_client
from datetime import datetime, timedelta
import ujson
from six.moves import range
from typing import Any, Dict, List, Optional, Text, Tuple, Type, Union
@ -238,6 +240,42 @@ class TestProcessCountStat(AnalyticsTestCase):
self.assertTableState(InstallationCount, ['property', 'value'],
[[user_stat.property, 6], [stream_stat.property, 6], [realm_stat.property, 6]])
def test_process_dependent_stat(self):
    # type: () -> None
    # Checks the two bounds process_count_stat applies to a
    # DependentCountStat: it fills no further than the fill_to_time it is
    # given, and no further than the last successful fill of any dependency.
    first_dependency = self.make_dummy_count_stat('stat1')
    second_dependency = self.make_dummy_count_stat('stat2')
    # The quadrupled percent signs become '%%(time_end)s' after the
    # formatting below; presumably a later round of %-formatting in the SQL
    # pull path reduces that to the '%(time_end)s' query parameter.
    query = """INSERT INTO analytics_realmcount (realm_id, value, property, end_time)
VALUES (%s, 1, '%s', %%%%(time_end)s)""" % (self.default_realm.id, 'stat3')
    collector = sql_data_collector(RealmCount, query, None)
    dependent_stat = DependentCountStat('stat3', collector, CountStat.HOUR,
                                        dependencies=['stat1', 'stat2'])
    hours = [installation_epoch() + offset*self.HOUR for offset in range(5)]

    def filled(stat_property, *hour_indices):
        # type: (str, *int) -> List[List[Any]]
        # Build fresh [property, end_time] rows for the given indices into hours.
        return [[stat_property, hours[index]] for index in hour_indices]

    # Only one of the two dependencies has been run: the dependent stat
    # must not be filled at all.
    process_count_stat(first_dependency, hours[2])
    process_count_stat(dependent_stat, hours[1])
    self.assertTableState(InstallationCount, ['property', 'end_time'],
                          filled('stat1', 1, 2))
    self.assertFillStateEquals(dependent_stat, hours[0])

    # Both dependencies are now filled beyond hours[1], but fill_to_time is
    # hours[1], so the dependent stat stops there.
    process_count_stat(second_dependency, hours[3])
    process_count_stat(dependent_stat, hours[1])
    self.assertTableState(InstallationCount, ['property', 'end_time'],
                          filled('stat1', 1, 2) +
                          filled('stat2', 1, 2, 3) +
                          filled('stat3', 1))
    self.assertFillStateEquals(dependent_stat, hours[1])

    # fill_to_time is now later than the dependencies' fills; the dependent
    # stat is capped at hours[2], the earliest dependency fill time.
    process_count_stat(dependent_stat, hours[4])
    self.assertTableState(InstallationCount, ['property', 'end_time'],
                          filled('stat1', 1, 2) +
                          filled('stat2', 1, 2, 3) +
                          filled('stat3', 1, 2))
    self.assertFillStateEquals(dependent_stat, hours[2])
class TestCountStats(AnalyticsTestCase):
def setUp(self):
# type: () -> None
@ -886,3 +924,104 @@ class TestActiveUsersAudit(AnalyticsTestCase):
user=user, property=self.current_property, subgroup='false',
end_time=end_time, value=1).exists())
self.assertFalse(UserCount.objects.filter(user=user2).exists())
class TestRealmActiveHumans(AnalyticsTestCase):
    # Tests for the 'realm_active_humans::day' stat, which is computed from
    # UserCount rows of 'active_users_audit:is_bot:day' and
    # '15day_actives::day'.

    def setUp(self):
        # type: () -> None
        super(TestRealmActiveHumans, self).setUp()
        stat = COUNT_STATS['realm_active_humans::day']
        self.stat = stat
        self.current_property = stat.property

    def mark_audit_active(self, user, end_time=None):
        # type: (UserProfile, Optional[datetime]) -> None
        # Insert the 'active_users_audit:is_bot:day' UserCount row for user
        # at end_time (self.TIME_ZERO when end_time is None). The subgroup
        # is the JSON encoding of user.is_bot.
        when = self.TIME_ZERO if end_time is None else end_time
        UserCount.objects.create(
            user=user, realm=user.realm, property='active_users_audit:is_bot:day',
            subgroup=ujson.dumps(user.is_bot), end_time=when, value=1)

    def mark_15day_active(self, user, end_time=None):
        # type: (UserProfile, Optional[datetime]) -> None
        # Insert the '15day_actives::day' UserCount row for user at end_time
        # (self.TIME_ZERO when end_time is None).
        when = self.TIME_ZERO if end_time is None else end_time
        UserCount.objects.create(
            user=user, realm=user.realm, property='15day_actives::day',
            end_time=when, value=1)

    def test_basic_boolean_logic(self):
        # type: () -> None
        # A user counts only on a day that has both kinds of rows; here
        # that is true solely for TIME_ZERO + DAY.
        user = self.create_user()
        yesterday = self.TIME_ZERO - self.DAY
        tomorrow = self.TIME_ZERO + self.DAY
        self.mark_audit_active(user, end_time=yesterday)
        self.mark_15day_active(user, end_time=self.TIME_ZERO)
        self.mark_audit_active(user, end_time=tomorrow)
        self.mark_15day_active(user, end_time=tomorrow)

        for day_offset in (-1, 0, 1):
            do_fill_count_stat_at_hour(self.stat, self.TIME_ZERO + day_offset*self.DAY)
        self.assertTableState(RealmCount, ['value', 'end_time'], [[1, tomorrow]])

    def test_bots_not_counted(self):
        # type: () -> None
        # A bot with both kinds of rows must not produce any RealmCount row.
        bot = self.create_user(is_bot=True)
        self.mark_audit_active(bot)
        self.mark_15day_active(bot)
        do_fill_count_stat_at_hour(self.stat, self.TIME_ZERO)
        self.assertTableState(RealmCount, [], [])

    def test_multiple_users_realms_and_times(self):
        # type: () -> None
        user1 = self.create_user()
        user2 = self.create_user()
        second_realm = Realm.objects.create(string_id='second', name='second')
        user3 = self.create_user(realm=second_realm)
        user4 = self.create_user(realm=second_realm)
        user5 = self.create_user(realm=second_realm)
        yesterday = self.TIME_ZERO - self.DAY
        tomorrow = self.TIME_ZERO + self.DAY

        def fill_three_days():
            # type: () -> None
            # Run the stat for the day before, the day of, and the day
            # after TIME_ZERO, in that order.
            for day_offset in (-1, 0, 1):
                do_fill_count_stat_at_hour(self.stat, self.TIME_ZERO + day_offset*self.DAY)

        def expected_rows():
            # type: () -> List[List[Any]]
            # Fresh copy of the [value, realm, end_time] rows both fills
            # below are expected to produce.
            return [[2, self.default_realm, self.TIME_ZERO],
                    [3, second_realm, self.TIME_ZERO],
                    [1, self.default_realm, yesterday],
                    [2, second_realm, yesterday]]

        # Everyone is fully marked at TIME_ZERO; a subset also a day earlier.
        for user in (user1, user2, user3, user4, user5):
            self.mark_audit_active(user)
            self.mark_15day_active(user)
        for user in (user1, user3, user4):
            self.mark_audit_active(user, end_time=yesterday)
            self.mark_15day_active(user, end_time=yesterday)

        fill_three_days()
        self.assertTableState(RealmCount, ['value', 'realm', 'end_time'], expected_rows())

        # Check that adding spurious entries doesn't make a difference
        self.mark_audit_active(user1, end_time=tomorrow)
        self.mark_15day_active(user2, end_time=tomorrow)
        self.mark_15day_active(user2, end_time=yesterday)
        self.create_user()
        third_realm = Realm.objects.create(string_id='third', name='third')
        self.create_user(realm=third_realm)

        RealmCount.objects.all().delete()
        fill_three_days()
        self.assertTableState(RealmCount, ['value', 'realm', 'end_time'], expected_rows())

    def test_end_to_end(self):
        # type: () -> None
        # Three users are created: user1 has an activity interval, user2 has
        # one but is then deactivated, and the third has none. After running
        # the dependencies and then the stat, exactly one row with value 1
        # is expected.
        user1 = do_create_user('email1', 'password', self.default_realm, 'full_name', 'short_name')
        user2 = do_create_user('email2', 'password', self.default_realm, 'full_name', 'short_name')
        do_create_user('email3', 'password', self.default_realm, 'full_name', 'short_name')
        time_zero = floor_to_day(timezone.now()) + self.DAY
        update_user_activity_interval(user1, time_zero)
        update_user_activity_interval(user2, time_zero)
        do_deactivate_user(user2)

        # Dependencies come first so the dependent stat sees their fills.
        ordered_properties = ('active_users_audit:is_bot:day', '15day_actives::day',
                              'realm_active_humans::day')
        for stat_property in ordered_properties:
            FillState.objects.create(property=stat_property, state=FillState.DONE,
                                     end_time=time_zero)
            process_count_stat(COUNT_STATS[stat_property], time_zero+self.DAY)

        single_human_rows = RealmCount.objects.filter(
            property='realm_active_humans::day', end_time=time_zero+self.DAY, value=1)
        self.assertEqual(single_human_rows.count(), 1)
        all_stat_rows = RealmCount.objects.filter(property='realm_active_humans::day')
        self.assertEqual(all_stat_rows.count(), 1)