Skip to content

Commit 7961f4f

Browse files
authored
Merge pull request #214 from appsembler/john/rework-course-progress
Enrollment Metrics Rework
2 parents 9eb0331 + d40dc78 commit 7961f4f

File tree

5 files changed

+650
-16
lines changed

5 files changed

+650
-16
lines changed

figures/pipeline/course_daily_metrics.py

+13-4
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from figures.models import CourseDailyMetrics, PipelineError
2929
from figures.pipeline.logger import log_error
3030
import figures.pipeline.loaders
31+
from figures.pipeline.enrollment_metrics import bulk_calculate_course_progress_data
3132
from figures.serializers import CourseIndexSerializer
3233
from figures.compat import GeneratedCertificate
3334
import figures.sites
@@ -81,7 +82,7 @@ def get_active_learner_ids_today(course_id, date_for):
8182
).values_list('student__id', flat=True).distinct()
8283

8384

84-
def get_average_progress(course_id, date_for, course_enrollments):
85+
def get_average_progress_deprecated(course_id, date_for, course_enrollments):
8586
"""Collects and aggregates raw course grades data
8687
"""
8788
progress = []
@@ -225,6 +226,9 @@ def extract(self, course_id, date_for=None, **_kwargs):
225226
average_days_to_complete=data.get('average_days_to_complete, None'),
226227
num_learners_completed=data['num_learners_completed'],
227228
)
229+
TODO: refactor this class
230+
Add lazy loading method to load course enrollments
231+
- Create a method for each metric field
228232
"""
229233

230234
# Update args if not assigned
@@ -247,18 +251,23 @@ def extract(self, course_id, date_for=None, **_kwargs):
247251
# we can do a lambda for course_enrollments to get the count
248252

249253
data['enrollment_count'] = course_enrollments.count()
254+
250255
active_learner_ids_today = get_active_learner_ids_today(
251256
course_id, date_for,)
252257
if active_learner_ids_today:
253258
active_learners_today = active_learner_ids_today.count()
254259
else:
255260
active_learners_today = 0
256-
257261
data['active_learners_today'] = active_learners_today
258-
data['average_progress'] = get_average_progress(
259-
course_id, date_for, course_enrollments,)
262+
263+
# Average progress
264+
progress_data = bulk_calculate_course_progress_data(course_id=course_id,
265+
date_for=date_for)
266+
data['average_progress'] = progress_data['average_progress']
267+
260268
data['average_days_to_complete'] = get_average_days_to_complete(
261269
course_id, date_for,)
270+
262271
data['num_learners_completed'] = get_num_learners_completed(
263272
course_id, date_for,)
264273

+280
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,280 @@
1+
"""This module collects metrics for a course enrollment (learner/course pair)
2+
3+
# Overview
4+
5+
This is new code for Figures 0.3.10. Its purpose is to address pipeline
6+
performance by reducing calls to `CourseGradeFactory().read.
7+
8+
Calls are reduced by checking if the learner has a `StudentModule` record
9+
modified after the most recent `LearnerCourseGradeMetrics` record `date_for`
10+
field
11+
12+
13+
# Logic
14+
15+
```
16+
17+
for each learner+course # See bulk_calculate_course_progress_data
18+
get newest lgcm record
19+
get newest sm record
20+
21+
# Check if update needed, see _enrollment_metrics_needs_update
22+
if not lcgm and not sm
23+
# Learner has not started course
24+
skip collection for this learner+course
25+
else if lcgm and sm
26+
# we have saved history so we can check to update
27+
if lcgm.modified is older than sm.modified
28+
collect new course data # See _collect_progress_data
29+
return new metrics object created
30+
31+
else if not lcgm and sm
32+
# Learner started course after last collection action
33+
collect new course data # See _collect_progress_data
34+
return new metrics object created
35+
36+
else # lcgm and not sm
37+
# We had collected data previously, but the upstream went away
38+
# This could happen with data cleansing, like GPPR requests
39+
# Likely other causes, too
40+
log condition to figures PipelineError log
41+
return None
42+
```
43+
44+
# NOTE: We plan to change 'lcgm' in this file to either 'enrollment' or 'LP'
45+
(for Learner Progress) when an abbreviation is acceptible. This is to get
46+
this module forward ready for reworking `LearnerCourseGradeMetrics`
47+
"""
48+
49+
from datetime import datetime
50+
from decimal import Decimal
51+
52+
from django.utils.timezone import utc
53+
54+
from figures.metrics import LearnerCourseGrades
55+
from figures.models import LearnerCourseGradeMetrics, PipelineError
56+
from figures.pipeline.logger import log_error
57+
from figures.sites import (
58+
get_site_for_course,
59+
course_enrollments_for_course,
60+
student_modules_for_course_enrollment,
61+
UnlinkedCourseError,
62+
)
63+
64+
65+
def bulk_calculate_course_progress_data(course_id, date_for=None):
66+
"""Calculates the average progress for a set of course enrollments
67+
68+
How it works
69+
1. collects progress percent for each course enrollment
70+
1.1. If up to date enrollment metrics record already exists, use that
71+
2. calculate and return the average of these enrollments
72+
73+
TODO: Update to filter on active users
74+
75+
Questions:
76+
- What makes a learner an active learner?
77+
78+
"""
79+
progress_percentages = []
80+
81+
if not date_for:
82+
date_for = datetime.utcnow().replace(tzinfo=utc).date()
83+
84+
for ce in course_enrollments_for_course(course_id):
85+
metrics = collect_metrics_for_enrollment(course_enrollment=ce,
86+
date_for=date_for)
87+
if metrics:
88+
progress_percentages.append(metrics.progress_percent)
89+
else:
90+
# Log this for troubleshooting
91+
error_data = dict(
92+
msg=('Unable to create or retrieve enrollment metrics ' +
93+
'for user {} and course {}'.format(ce.user.username,
94+
str(ce.course_id)))
95+
)
96+
log_error(
97+
error_data=error_data,
98+
error_type=PipelineError.COURSE_DATA,
99+
user=ce.user,
100+
course_id=str(course_id)
101+
)
102+
103+
return dict(
104+
average_progress=calculate_average_progress(progress_percentages),
105+
)
106+
107+
108+
def calculate_average_progress(progress_percentages):
109+
"""Calcuates average progress from a list of values
110+
111+
TODO: How do we want to handle malformed data?
112+
"""
113+
if progress_percentages:
114+
average_progress = float(sum(progress_percentages)) / float(len(progress_percentages))
115+
average_progress = float(Decimal(average_progress).quantize(Decimal('.00')))
116+
else:
117+
average_progress = 0.0
118+
return average_progress
119+
120+
121+
def collect_metrics_for_enrollment(course_enrollment, date_for=None, **_kwargs):
122+
"""Collect metrics for enrollment (learner+course)
123+
124+
This function performs course enrollment merics collection for the given
125+
unique enrollment identified by the learner and course. It is initial code
126+
in refactoring the pipeline for learner progress
127+
128+
Important models here are:
129+
* StudentModule (SM)
130+
* LearnerCourseGradeMetrics (LCGM)
131+
* CourseEnrollment (CE)
132+
133+
# Workflow
134+
135+
* Get date of most recent StudentModule record, for the enrollment record
136+
* Get date of most recent LearnerCourseGradeMetrics record, LCGM, for the
137+
enrollment
138+
* if LCGM does not exist or SM.modified is more recent than LCGM.modified then
139+
* collect course_data via CourseEnrollmentFactory().read(...)
140+
* create new LearnerCourseGradesMetics record with the updated data
141+
* retain reference to tne new record
142+
* else return the existing LearnerCourseGradesMetics record if it exists
143+
* else return None if no existing LearnerCourseGradesMetics record
144+
"""
145+
if not date_for:
146+
date_for = datetime.utcnow().replace(tzinfo=utc).date()
147+
site = get_site_for_course(course_enrollment.course_id)
148+
if not site:
149+
raise UnlinkedCourseError('No site found for course "{}"'.format(
150+
course_enrollment.course_id))
151+
152+
# The following are two different ways to avoide the dreaded error
153+
# "Instance of 'list' has no 'order_by' member (no-member)"
154+
# See: https://github.com/PyCQA/pylint-django/issues/165
155+
student_modules = student_modules_for_course_enrollment(ce=course_enrollment)
156+
if student_modules:
157+
most_recent_sm = student_modules.latest('modified')
158+
else:
159+
most_recent_sm = None
160+
161+
lcgm = LearnerCourseGradeMetrics.objects.filter(
162+
user=course_enrollment.user,
163+
course_id=str(course_enrollment.course_id))
164+
most_recent_lcgm = lcgm.order_by('date_for').last() # pylint: disable=E1101
165+
166+
if _enrollment_metrics_needs_update(most_recent_lcgm, most_recent_sm):
167+
progress_data = _collect_progress_data(most_recent_sm)
168+
metrics = _add_enrollment_metrics_record(site=site,
169+
course_enrollment=course_enrollment,
170+
progress_data=progress_data,
171+
date_for=date_for)
172+
else:
173+
metrics = most_recent_lcgm
174+
return metrics
175+
176+
177+
def _enrollment_metrics_needs_update(most_recent_lcgm, most_recent_sm):
178+
"""Returns True if we need to update our learner progress, False otherwise
179+
180+
See the #Logic section in this module's docstring
181+
182+
If we need to check that the records match the same user and course, we
183+
can do something like:
184+
185+
```
186+
class RecordMismatchError(Exception):
187+
pass
188+
189+
190+
def rec_matches_user_and_course(lcgm, sm):
191+
return lcgm.user == sm.student and lcgm.course_id == sm.course_id
192+
```
193+
194+
And in this function add the check when we have both records:
195+
196+
```
197+
if not rec_matches_user_and_course(most_recent_lcgm, most_recent_sm):
198+
rec_msg = '{}(user_id={}, course_id="{}"'
199+
msg1 = rec_msg.format('lcgm',
200+
most_recent_lcgm.user.id,
201+
most_recent_lcgm.course_id)
202+
msg2 = rec_msg.format('sm',
203+
most_recent_sm.student.id,
204+
most_recent_sm.course_id)
205+
raise RecordMismatchError(msg1 + ':' + msg2)
206+
```
207+
"""
208+
# First assume we need to update the enrollment metrics record
209+
needs_update = True
210+
if not most_recent_lcgm and not most_recent_sm:
211+
# Learner has not started coursework
212+
needs_update = False
213+
elif most_recent_lcgm and most_recent_sm:
214+
# Learner has past course activity
215+
needs_update = most_recent_lcgm.date_for < most_recent_sm.modified.date()
216+
elif not most_recent_lcgm and most_recent_sm:
217+
# No LCGM recs, so Learner started on course after last collection
218+
# This could also happen
219+
# If this is the irst time collection is run for the learner+course
220+
# if an unhandled error prevents LCGM from saving
221+
# if the learner's LCGM recs were deleted
222+
needs_update = True
223+
elif most_recent_lcgm and not most_recent_sm:
224+
# This shouldn't happen. We log this state as an error
225+
226+
# using 'COURSE_DATA' for the pipeline error type. Although we should
227+
# revisit logging and error tracking in Figures to define a clear
228+
# approach that has clear an intuitive contexts for the logging event
229+
#
230+
# We neede to decide:
231+
#
232+
# 1. Is this always an error state? Could this condition happen naturally?
233+
#
234+
# 2. Which error type should be created and what is the most applicable
235+
# context
236+
# Candidates
237+
# - Enrollment (learner+course) context
238+
# - Data state context - Why would we have an LCGM
239+
#
240+
# So we hold off updating PipelineError error choises initially until
241+
# we can think carefully on how we tag pipeline errors
242+
#
243+
error_data = dict(
244+
msg='LearnerCourseGradeMetrics record exists without StudentModule',
245+
)
246+
log_error(
247+
error_data=error_data,
248+
error_type=PipelineError.COURSE_DATA,
249+
user=most_recent_lcgm.user,
250+
course_id=str(most_recent_lcgm.course_id)
251+
)
252+
needs_update = False
253+
return needs_update
254+
255+
256+
def _add_enrollment_metrics_record(site, course_enrollment, progress_data, date_for):
257+
"""Convenience function to save progress metrics to Figures
258+
"""
259+
return LearnerCourseGradeMetrics.objects.create(
260+
site=site,
261+
user=course_enrollment.user,
262+
course_id=str(course_enrollment.course_id),
263+
date_for=date_for,
264+
points_possible=progress_data['points_possible'],
265+
points_earned=progress_data['points_earned'],
266+
sections_worked=progress_data['sections_worked'],
267+
sections_possible=progress_data['count']
268+
)
269+
270+
271+
def _collect_progress_data(student_module):
272+
"""Get new progress data for the learner/course
273+
274+
Uses `figures.metrics.LearnerCourseGrades` to retrieve progress data via
275+
`CourseGradeFactory().read(...)` and calculate progress percentage
276+
"""
277+
lcg = LearnerCourseGrades(user_id=student_module.student_id,
278+
course_id=student_module.course_id)
279+
course_progress_details = lcg.progress()
280+
return course_progress_details

figures/sites.py

+32
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,22 @@ class CourseNotInSiteError(CrossSiteResourceError):
3939
pass
4040

4141

42+
class UnlinkedCourseError(Exception):
43+
"""Raise when we need to fail if we can't get the site for a course
44+
45+
This will happen in multisite mode if the course is not linked to the site.
46+
In Tahoe Hawthorn, we use the Appsembler fork of `edx-organizations` to map
47+
courses to sites. For community and standalone deployments, we don't expect
48+
courses to map to sites, so we just return the app instance's default site.
49+
50+
* This is new to support enrollment metrics rework (May 2020)
51+
* We need to evaluate if we want to make sites.get_site_for_course` to
52+
raise this automatically. But first we need to make sure we have test
53+
coverage to make sure we don't introduce new bugs
54+
"""
55+
pass
56+
57+
4258
def site_to_id(site):
4359
"""
4460
Helper to cast site or site id to id
@@ -181,3 +197,19 @@ def get_student_modules_for_course_in_site(site, course_id):
181197
def get_student_modules_for_site(site):
182198
course_ids = get_course_keys_for_site(site)
183199
return StudentModule.objects.filter(course_id__in=course_ids)
200+
201+
202+
def course_enrollments_for_course(course_id):
203+
"""Return a queryset of all `CourseEnrollment` records for a course
204+
205+
Relies on the fact that course_ids are globally unique
206+
"""
207+
return CourseEnrollment.objects.filter(course_id=as_course_key(course_id))
208+
209+
210+
def student_modules_for_course_enrollment(ce):
211+
"""Return a queryset of all `StudentModule` records for a `CourseEnrollment`1
212+
213+
Relies on the fact that course_ids are globally unique
214+
"""
215+
return StudentModule.objects.filter(student=ce.user, course_id=ce.course_id)

0 commit comments

Comments
 (0)