Skip to content

Download statistics

DownloadStatistics

Bases: Statistics

Class used to implement the download statistics action.

Source code in ckanext/statistics/lib/download_statistics.py
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
class DownloadStatistics(Statistics):
    """
    Class used to implement the download statistics action.

    The statistics are collected into a MonthlyStats object from the ckan packager stats, the
    versioned datastore download stats and, when no resource filter is in use, the backfill
    file and the GBIF stats.
    """

    schema = statistics_downloads_schema()

    def _get_statistics(self, year=None, month=None, resource_id=None):
        """
        Fetch the statistics.

        :param year: (optional, default: None) passed to MonthlyStats as its year filter
        :param month: (optional, default: None) passed to MonthlyStats as its month filter
        :param resource_id: (optional, default: None) passed to MonthlyStats as its resource
            filter
        :returns: dict of stats
        """
        monthly_stats = MonthlyStats(month, year, resource_id)
        self.add_ckanpackager_stats(monthly_stats)
        self.add_versioned_datastore_download_stats(monthly_stats)

        # if no resource_id has been specified we can add the backfill and gbif stats as they aren't
        # filterable by resource ID
        if not resource_id:
            self.add_backfill_stats(backfill_filename, monthly_stats)
            self.add_gbif_stats(monthly_stats, year, month)

        return monthly_stats.as_dict()

    @staticmethod
    def add_gbif_stats(monthly_stats, year=None, month=None):
        """
        Add the GBIF download stats to the monthly stats object.

        :param monthly_stats: the MonthlyStats object to add the GBIF stats to
        :param year: (optional, default: None) passed through to get_gbif_stats
        :param month: (optional, default: None) passed through to get_gbif_stats
        """

        gbif_stats = get_gbif_stats(year, month)
        for result in gbif_stats:
            monthly_stats.update_from_gbif(
                result['month'], result['year'], result['records'], result['events']
            )

    @staticmethod
    def add_ckanpackager_stats(monthly_stats):
        """
        Updates the given MonthlyStats object with the ckan packager download stats.

        Note: this function used to aggregate the statistics in the database using SQL; however,
        this changed when the versioned datastore download stats were added and now it's all
        done in Python. This is because the versioned datastore download stats are more
        complicated to parse and aggregate as the record counts are stored in a JSONB column
        using the resource ids as keys and the counts as values. Creating an SQL query to
        aggregate this would probably have been possible but it would have been horrible to
        maintain and therefore handling it in Python made more sense. To avoid having two
        completely different aggregation mechanisms in the same area, the ckan packager stats
        aggregation was switched over to Python too.

        :param monthly_stats: a MonthlyStats object
        """
        for row in model.Session.query(CKANPackagerStat):
            # a missing count is treated as zero records downloaded
            count = int(row.count) if row.count is not None else 0
            monthly_stats.add(row.inserted_on, row.resource_id, count)

    @staticmethod
    def add_versioned_datastore_download_stats(monthly_stats):
        """
        Updates the given MonthlyStats object with the versioned datastore download
        stats. Only "complete" downloads are counted.

        :param monthly_stats: a MonthlyStats object
        """
        for download in model.Session.query(DownloadRequest).filter(
            DownloadRequest.state == DownloadRequest.state_complete
        ):
            monthly_stats.add_all(
                download.created, download.core_record.resource_totals
            )

    @staticmethod
    def add_backfill_stats(filename, monthly_stats):
        """
        Updates the MonthlyStats object with static data from a json file that can be
        used to fill gaps in the API's returned statistics.

        The file is looked up in the ckanext.statistics.data package and is expected to hold
        a dict keyed by year, each value being a dict of month -> stats.

        :param filename: the name of the json file containing the statistics, if None this
            function does nothing
        :param monthly_stats: a MonthlyStats object
        """
        if filename is None:
            return

        backfill_file = files('ckanext.statistics.data').joinpath(filename)
        # decode explicitly as UTF-8 so that the result doesn't depend on the locale's
        # default encoding
        backfill_data = json.loads(backfill_file.read_text(encoding='utf-8'))

        for year in backfill_data:
            for month, stats in backfill_data[year].items():
                monthly_stats.update_from_backfill(month, year, stats)

add_backfill_stats(filename, monthly_stats) staticmethod

Updates the MonthlyStats object with static data from a json file that can be used to fill gaps in the API's returned statistics.

Parameters:

Name Type Description Default
filename

the name of the json file containing the statistics

required
monthly_stats

a MonthlyStats object

required
Source code in ckanext/statistics/lib/download_statistics.py
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
@staticmethod
def add_backfill_stats(filename, monthly_stats):
    """
    Updates the MonthlyStats object with static data from a json file that can be
    used to fill gaps in the API's returned statistics.

    The file is looked up in the ckanext.statistics.data package and is expected to hold a
    dict keyed by year, each value being a dict of month -> stats.

    :param filename: the name of the json file containing the statistics, if None this
        function does nothing
    :param monthly_stats: a MonthlyStats object
    """
    if filename is None:
        return

    backfill_file = files('ckanext.statistics.data').joinpath(filename)
    # decode explicitly as UTF-8 so that the result doesn't depend on the locale's default
    # encoding
    backfill_data = json.loads(backfill_file.read_text(encoding='utf-8'))

    for year in backfill_data:
        for month, stats in backfill_data[year].items():
            monthly_stats.update_from_backfill(month, year, stats)

add_ckanpackager_stats(monthly_stats) staticmethod

Updates the given MonthlyStats object with the ckan packager download stats.

Note: this function used to aggregate the statistics in the database using SQL; however, this changed when the versioned datastore download stats were added, and now it is all done in Python. This is because the versioned datastore download stats are more complicated to parse and aggregate, as the record counts are stored in a JSONB column using the resource ids as keys and the counts as values. Creating an SQL query to aggregate this would probably have been possible, but it would have been horrible to maintain, so handling it in Python made more sense. To avoid having two completely different aggregation mechanisms in the same area, the ckan packager stats aggregation was switched over to Python too.

Parameters:

Name Type Description Default
monthly_stats

a MonthlyStats object

required
Source code in ckanext/statistics/lib/download_statistics.py
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
@staticmethod
def add_ckanpackager_stats(monthly_stats):
    """
    Feeds every ckan packager download stat row into the given MonthlyStats object.

    Each CKANPackagerStat row is added on its own using the row's inserted_on date, its
    resource_id and its count, where a missing (None) count is treated as 0.

    Note: the aggregation is left to the MonthlyStats object rather than being done in SQL.
    It used to be done in the database, but the versioned datastore download stats keep
    their record counts in a JSONB column (resource ids as keys, counts as values) which
    would have needed a hard to maintain query, so those are aggregated in Python and the
    ckan packager stats follow the same approach to keep a single mechanism.

    :param monthly_stats: a MonthlyStats object
    """
    packager_rows = model.Session.query(CKANPackagerStat)
    for stat in packager_rows:
        if stat.count is None:
            records = 0
        else:
            records = int(stat.count)
        monthly_stats.add(stat.inserted_on, stat.resource_id, records)

add_gbif_stats(monthly_stats, year=None, month=None) staticmethod

Add the GBIF download stats to the monthly stats object.

Parameters:

Name Type Description Default
monthly_stats

the MonthlyStats object to add the GBIF stats to

required
year

(optional, default: None)

None
month

(optional, default: None)

None
Source code in ckanext/statistics/lib/download_statistics.py
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
@staticmethod
def add_gbif_stats(monthly_stats, year=None, month=None):
    """
    Pushes the GBIF download stats into the monthly stats object.

    The stats are retrieved with get_gbif_stats and each result's month, year, records and
    events values are handed to the MonthlyStats object's update_from_gbif method.

    :param monthly_stats: the MonthlyStats object to add the GBIF stats to
    :param year: (optional, default: None) passed through to get_gbif_stats
    :param month: (optional, default: None) passed through to get_gbif_stats
    """
    for entry in get_gbif_stats(year, month):
        gbif_month = entry['month']
        gbif_year = entry['year']
        record_count = entry['records']
        event_count = entry['events']
        monthly_stats.update_from_gbif(gbif_month, gbif_year, record_count, event_count)

add_versioned_datastore_download_stats(monthly_stats) staticmethod

Updates the given MonthlyStats object with the versioned datastore download stats. Only "complete" downloads are counted.

Parameters:

Name Type Description Default
monthly_stats

a MonthlyStats object

required
Source code in ckanext/statistics/lib/download_statistics.py
235
236
237
238
239
240
241
242
243
244
245
246
247
248
@staticmethod
def add_versioned_datastore_download_stats(monthly_stats):
    """
    Feeds the versioned datastore download stats into the given MonthlyStats object.

    Only download requests whose state is the "complete" state are counted. For each one,
    the request's created date and its core record's resource totals are added.

    :param monthly_stats: a MonthlyStats object
    """
    all_requests = model.Session.query(DownloadRequest)
    completed_requests = all_requests.filter(
        DownloadRequest.state == DownloadRequest.state_complete
    )
    for request in completed_requests:
        totals = request.core_record.resource_totals
        monthly_stats.add_all(request.created, totals)

MonthlyStats

Bases: object

Class used to keep track of the monthly download counts from multiple sources.

Source code in ckanext/statistics/lib/download_statistics.py
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
class MonthlyStats(object):
    """
    Class used to keep track of the monthly download counts from multiple sources.

    The counts are held in self.stats which is keyed by "<month>/<year>" strings where the
    month is not zero padded (e.g. "3/2020"). Each value holds a records and a
    download_events count for each of the collections, research and gbif groups.
    """

    def __init__(self, month=None, year=None, resource_id=None):
        """
        :param month: if passed, only this month will be counted (defaults to None)
        :param year: if passed, only this year will be counted (defaults to None)
        :param resource_id: if passed, only this resource will be counted (defaults to None)
        """
        self.month = int(month) if month is not None else None
        self.year = int(year) if year is not None else None
        self.resource_id = resource_id

        self.stats = defaultdict(
            lambda: {
                'collections': {
                    'records': 0,
                    'download_events': 0,
                },
                'research': {
                    'records': 0,
                    'download_events': 0,
                },
                'gbif': {
                    'records': 0,
                    'download_events': 0,
                },
            }
        )
        # extract the collection resource ids from the config. The result must always be a set
        # (add_all uses set operations on it), even if the option is missing or set to an empty
        # string
        configured_ids = toolkit.config.get('ckanext.statistics.resource_ids') or ()
        if isinstance(configured_ids, str):
            # split on any run of whitespace so that repeated spaces don't produce empty ids
            configured_ids = configured_ids.split()
        self.collection_resource_ids = set(configured_ids)

    def add(self, date, resource_id, count):
        """
        Updates the stats with the download event information for the given resource and
        count at the given date.

        :param date: the date of the download event
        :param resource_id: the resource downloaded
        :param count: the number of records downloaded
        """
        self.add_all(date, {resource_id: count})

    def add_all(self, date, resource_counts):
        """
        Updates the stats with the download event information for the given resources
        and counts at the given date. This function filters out information about
        months/years/resources we're not interested in based on the parameters passed
        during the construction of this object.

        A single call counts as at most one download event per group (collections and
        research), regardless of how many resources from that group are included.

        :param date: the date of the download event
        :param resource_counts: a dict of resource ids -> counts, counts which are None are
            treated as 0
        """
        # read the month and year straight off the date rather than via strftime('%-m/%Y') as
        # the '%-m' directive is a platform-specific extension which isn't available everywhere
        month, year = date.month, date.year
        month_year = f'{month}/{year}'

        # filter the download event
        if self.resource_id is not None:
            if self.resource_id in resource_counts:
                # only update with counts for the resource id requested
                resource_counts = {self.resource_id: resource_counts[self.resource_id]}
            else:
                return
        if self.month is not None and self.month != month:
            return
        if self.year is not None and self.year != year:
            return

        for resource_id, count in resource_counts.items():
            if resource_id in self.collection_resource_ids:
                resource_type = 'collections'
            else:
                resource_type = 'research'
            self.stats[month_year][resource_type]['records'] += count or 0

        resources = set(resource_counts.keys())
        if self.collection_resource_ids.intersection(resources):
            self.stats[month_year]['collections']['download_events'] += 1

        if resources.difference(self.collection_resource_ids):
            self.stats[month_year]['research']['download_events'] += 1

    def update_from_backfill(self, month, year, stats):
        """
        Adds the given stats for the given month and year to our stats dict. The month
        and year filters will be applied if applicable.

        :param month: the month, as an int or a string which int() can parse
        :param year: the year, as an int or a string which int() can parse
        :param stats: the stats in a dict, the format of this dict must match a
            month/year entry in the self.stats dict
        """
        # normalise to ints so that the key we build matches the ones built by add_all and
        # update_from_gbif (a month of "03" must land in the "3/<year>" entry)
        month = int(month)
        year = int(year)
        if self.month is not None and self.month != month:
            return
        if self.year is not None and self.year != year:
            return

        month_year = f'{month}/{year}'
        for group in ('collections', 'research'):
            for count_name in ('download_events', 'records'):
                self.stats[month_year][group][count_name] += stats.get(group, {}).get(
                    count_name, 0
                )

    def update_from_gbif(self, month, year, records, download_events):
        """
        Add gbif stats to this stats object. The month and year filters will be applied if
        applicable.

        :param month: the month
        :param year: the year
        :param records: the number of records downloaded, None is treated as 0
        :param download_events: the number of download events, None is treated as 0
        """
        month = int(month)
        year = int(year)
        if self.month is not None and self.month != month:
            return
        if self.year is not None and self.year != year:
            return

        month_year = f'{month}/{year}'
        records = int(records if records is not None else 0)
        download_events = int(download_events if download_events is not None else 0)
        self.stats[month_year]['gbif']['records'] += records
        self.stats[month_year]['gbif']['download_events'] += download_events

    def as_dict(self):
        """
        Return an OrderedDict of count stats in ascending chronological order.

        :returns: an OrderedDict
        """
        return OrderedDict(
            sorted(
                self.stats.items(),
                # the keys are "<month>/<year>" so reverse the parts to sort on year then month
                key=lambda x: tuple(map(int, reversed(x[0].split('/')))),
            )
        )

__init__(month=None, year=None, resource_id=None)

Parameters:

Name Type Description Default
month

if passed, only this month will be counted (defaults to None)

None
year

if passed, only this year will be counted (defaults to None)

None
resource_id

if passed, only this resource will be counted (defaults to None)

None
Source code in ckanext/statistics/lib/download_statistics.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def __init__(self, month=None, year=None, resource_id=None):
    """
    Sets up the filters, an empty stats dict and the set of collection resource ids.

    :param month: if passed, only this month will be counted (defaults to None)
    :param year: if passed, only this year will be counted (defaults to None)
    :param resource_id: if passed, only this resource will be counted (defaults to None)
    """
    self.month = int(month) if month is not None else None
    self.year = int(year) if year is not None else None
    self.resource_id = resource_id

    # each month/year entry starts out with zeroed counts for every group
    self.stats = defaultdict(
        lambda: {
            'collections': {
                'records': 0,
                'download_events': 0,
            },
            'research': {
                'records': 0,
                'download_events': 0,
            },
            'gbif': {
                'records': 0,
                'download_events': 0,
            },
        }
    )
    # extract the collection resource ids from the config. The result must always be a set as
    # set operations are performed on it elsewhere, even if the option is missing or set to an
    # empty string
    configured_ids = toolkit.config.get('ckanext.statistics.resource_ids') or ()
    if isinstance(configured_ids, str):
        # split on any run of whitespace so that repeated spaces don't produce empty ids
        configured_ids = configured_ids.split()
    self.collection_resource_ids = set(configured_ids)

add(date, resource_id, count)

Updates the stats with the download event information for the given resource and count at the given date.

Parameters:

Name Type Description Default
date

the date of the download event

required
resource_id

the resource downloaded

required
count

the number of records downloaded

required
Source code in ckanext/statistics/lib/download_statistics.py
63
64
65
66
67
68
69
70
71
72
def add(self, date, resource_id, count):
    """
    Records a download event for a single resource at the given date.

    This is a convenience wrapper which hands a one entry resource id -> count dict over
    to add_all.

    :param date: the date of the download event
    :param resource_id: the resource downloaded
    :param count: the number of records downloaded
    """
    single_resource_counts = {resource_id: count}
    self.add_all(date, single_resource_counts)

add_all(date, resource_counts)

Updates the stats with the download event information for the given resources and counts at the given date. This function filters out information about months/years/resources we're not interested in based on the parameters passed during the construction of this object.

Parameters:

Name Type Description Default
date

the date of the download event

required
resource_counts

a dict of resource ids -> counts

required
Source code in ckanext/statistics/lib/download_statistics.py
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
def add_all(self, date, resource_counts):
    """
    Updates the stats with the download event information for the given resources
    and counts at the given date. This function filters out information about
    months/years/resources we're not interested in based on the parameters passed
    during the construction of this object.

    A single call counts as at most one download event per group (collections and
    research), regardless of how many resources from that group are included.

    :param date: the date of the download event
    :param resource_counts: a dict of resource ids -> counts, counts which are None are
        treated as 0
    """
    # read the month and year straight off the date rather than via strftime('%-m/%Y') as the
    # '%-m' directive is a platform-specific extension which isn't available everywhere
    month, year = date.month, date.year
    month_year = f'{month}/{year}'

    # filter the download event
    if self.resource_id is not None:
        if self.resource_id in resource_counts:
            # only update with counts for the resource id requested
            resource_counts = {self.resource_id: resource_counts[self.resource_id]}
        else:
            return
    if self.month is not None and self.month != month:
        return
    if self.year is not None and self.year != year:
        return

    for resource_id, count in resource_counts.items():
        if resource_id in self.collection_resource_ids:
            resource_type = 'collections'
        else:
            resource_type = 'research'
        self.stats[month_year][resource_type]['records'] += count or 0

    resources = set(resource_counts.keys())
    if self.collection_resource_ids.intersection(resources):
        self.stats[month_year]['collections']['download_events'] += 1

    if resources.difference(self.collection_resource_ids):
        self.stats[month_year]['research']['download_events'] += 1

as_dict()

Return an OrderedDict of count stats in ascending chronological order.

Returns:

Type Description

an OrderedDict

Source code in ckanext/statistics/lib/download_statistics.py
157
158
159
160
161
162
163
164
165
166
167
168
def as_dict(self):
    """
    Builds an OrderedDict of the count stats, ordered chronologically from the earliest
    month/year entry to the latest.

    :returns: an OrderedDict
    """

    def chronological(entry):
        # the keys look like "<month>/<year>", flip the parts so the year is compared first
        parts = entry[0].split('/')
        return tuple(int(part) for part in parts[::-1])

    ordered_entries = sorted(self.stats.items(), key=chronological)
    return OrderedDict(ordered_entries)

update_from_backfill(month, year, stats)

Adds the given stats for the given month and year to our stats dict. The month and year filters will be applied if applicable.

Parameters:

Name Type Description Default
month

the month

required
year

the year

required
stats

the stats in a dict, the format of this dict must match a month/year entry in the self.stats dict

required
Source code in ckanext/statistics/lib/download_statistics.py
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
def update_from_backfill(self, month, year, stats):
    """
    Adds the given stats for the given month and year to our stats dict. The month
    and year filters will be applied if applicable.

    Only the collections and research groups are read from the stats dict, any group or
    count missing from it is treated as 0.

    :param month: the month, as an int or a string which int() can parse
    :param year: the year, as an int or a string which int() can parse
    :param stats: the stats in a dict, the format of this dict must match a
        month/year entry in the self.stats dict
    """
    # normalise to ints so that the key we build has no zero padding, otherwise a month of
    # "03" would end up in its own "03/<year>" entry rather than in "3/<year>"
    month = int(month)
    year = int(year)
    if self.month is not None and self.month != month:
        return
    if self.year is not None and self.year != year:
        return

    month_year = f'{month}/{year}'
    for group in ('collections', 'research'):
        for count_name in ('download_events', 'records'):
            self.stats[month_year][group][count_name] += stats.get(group, {}).get(
                count_name, 0
            )

update_from_gbif(month, year, records, download_events)

Add gbif stats to this stats object.

Parameters:

Name Type Description Default
month

the month

required
year

the year

required
records

the number of records downloaded

required
download_events

the number of download events

required
Source code in ckanext/statistics/lib/download_statistics.py
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
def update_from_gbif(self, month, year, records, download_events):
    """
    Adds the given GBIF counts to the gbif group of the matching month/year entry. Nothing
    is added if the month or year filter is set and doesn't match.

    :param month: the month
    :param year: the year
    :param records: the number of records downloaded, None is treated as 0
    :param download_events: the number of download events, None is treated as 0
    """
    month, year = int(month), int(year)

    # apply the month and year filters, if there are any
    if self.month not in (None, month):
        return
    if self.year not in (None, year):
        return

    # convert both counts before touching the stats so that a bad value changes nothing
    record_total = int(0 if records is None else records)
    event_total = int(0 if download_events is None else download_events)

    gbif_counts = self.stats[f'{month}/{year}']['gbif']
    gbif_counts['records'] += record_total
    gbif_counts['download_events'] += event_total