Sample data
As sample data, we use a subset of users over six months from the Google Analytics public dataset. Only the user ID and the date of visit will be used.
-- Sample data: get visit dates for a subset of users on Google Analytics public data
CREATE TABLE IF NOT EXISTS `training.ga_users` AS (
SELECT DISTINCT
fullVisitorId AS user_id,
PARSE_DATE("%Y%m%d", date) AS date
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*`
WHERE
_TABLE_SUFFIX BETWEEN '20170101' AND '20170630'
AND RIGHT(fullVisitorId, 1) = '9'
);
-- Look at first rows
SELECT *
FROM `training.ga_users`
ORDER BY user_id, date
LIMIT 10
| user_id | date |
|---------------------|------------|
| 0001066168651197929 | 2017-06-27 |
| 0001066168651197929 | 2017-06-28 |
| 0001285462512259769 | 2017-06-30 |
| 0001436786657417059 | 2017-05-15 |
| 0002113059720701099 | 2017-01-13 |
| 0003816501532259309 | 2017-01-22 |
| 0004359630748414109 | 2017-01-11 |
| 0004615806807819649 | 2017-03-29 |
| 0004964943560913819 | 2017-02-08 |
| 000614761893770079 | 2017-02-10 |
Returning users each month
The question asked is: each month, how many users were already active on a previous month ?
A strategy is to self join a table containing one row per user and month, with an offset in the months.
- Keep only one row per user and month of activity
- Join this table with itself, on user IDs and previous months < current month
- Group by month and count distinct users
-- Step 1: get one row per user and month of activity
WITH sub1 AS (
SELECT DISTINCT
user_id,
DATE_TRUNC(date, month) AS month
FROM `training.ga_users`
)
-- Step 2: join with same table, on user_id and months before current month
-- COUNT() must be made on DISTINCT user_id, to avoid counting multiple previous months
SELECT
curr.month,
COUNT(DISTINCT prev.user_id) AS returning_users
FROM sub1 AS curr
INNER JOIN sub1 AS prev
ON curr.user_id = prev.user_id
AND prev.month < curr.month
GROUP BY 1
ORDER BY month
| month | returning_users |
|------------|-----------------|
| 2017-02-01 | 205 |
| 2017-03-01 | 245 |
| 2017-04-01 | 261 |
| 2017-05-01 | 312 |
| 2017-06-01 | 329 |
Returning users from previous month
The question is now more detailed: each month, how many users were active the month before ?
Same strategy as for users active any month before, but the join is made exactly on current month = previous month.
-- Step 1: get one row per user and month of activity
WITH sub1 AS (
SELECT DISTINCT
user_id,
DATE_TRUNC(date, month) AS month
FROM `training.ga_users`
)
-- Step 2: almost same join as before,
-- but previous activity is precisely the month before
SELECT
curr.month,
COUNT(DISTINCT prev.user_id) AS returning_users
FROM sub1 AS curr
INNER JOIN sub1 AS prev
ON curr.user_id = prev.user_id
AND prev.month = DATE_ADD(curr.month, INTERVAL -1 MONTH)
GROUP BY 1
ORDER BY month
| month | returning_users |
|------------|-----------------|
| 2017-02-01 | 205 |
| 2017-03-01 | 192 |
| 2017-04-01 | 205 |
| 2017-05-01 | 215 |
| 2017-06-01 | 248 |
New users each month
Counting users that never connected before can be done in two simple steps:
- Get the minimum connection date (truncated at month) for each user
- Group by month, and count users
-- Step 1: get first date for each user
WITH sub1 AS (
SELECT
user_id,
MIN(DATE_TRUNC(date, month)) AS month
FROM `training.ga_users`
GROUP BY 1
)
-- Step 2: group and count by month
SELECT
month,
COUNT(*) AS new_users
FROM sub1
GROUP BY 1
ORDER BY month
| month | returning_users |
|------------|-----------------|
| 2017-01-01 | 5267 |
| 2017-02-01 | 4904 |
| 2017-03-01 | 5577 |
| 2017-04-01 | 5287 |
| 2017-05-01 | 4877 |
| 2017-06-01 | 4877 |
Share of new users per month
To get the share of new users every month:
- Get one distinct row per user and month of activity
- Compute all users: on the first table, group by month and count users
- Compute new users: on the first table, get minimum date for each user, then group by month, and count
- Join
all users
table withnew users
table, and compute share of new users
-- Step 1: get one row per user and month of activity
WITH sub1 AS (
SELECT DISTINCT
user_id,
DATE_TRUNC(date, month) AS month
FROM `training.ga_users`
),
-- Step 2: get ALL users per month
au AS (
SELECT
month,
COUNT(*) AS all_users
FROM sub1
GROUP BY 1
),
-- Step 3: get NEW users per month
nu AS (
SELECT
month,
COUNT(*) AS new_users
FROM (
SELECT
user_id,
MIN(month) AS month
FROM sub1
GROUP BY 1
)
GROUP BY 1
)
-- Step 4: join and compute share
SELECT
au.month,
all_users,
new_users,
new_users/all_users AS new_users_share
FROM au
LEFT JOIN nu
USING(month)
ORDER BY month
| month | all_users | new_users | new_users_share |
|------------|-----------|-----------|-----------------|
| 2017-01-01 | 5267 | 5267 | 1.000000 |
| 2017-02-01 | 5109 | 4904 | 0.959875 |
| 2017-03-01 | 5822 | 5577 | 0.957918 |
| 2017-04-01 | 5548 | 5287 | 0.952956 |
| 2017-05-01 | 5189 | 4877 | 0.939873 |
| 2017-06-01 | 5206 | 4877 | 0.936804 |
Cohorts by users first month
To calculate retention over months, by cohorts of users based on their first month:
- Get first month for each user
- Join with users activity, grouped by month
- Compute age of cohort and retention percentage
-- Step 1: group in cohorts, based on user first month
WITH sub1 AS (
SELECT
user_id,
MIN(DATE_TRUNC(date, month)) AS cohort_month
FROM `training.ga_users`
GROUP BY 1
),
-- Step 2: join with users monthly activity and aggregate
sub2 AS (
SELECT
cohort_month,
DATE_TRUNC(date, month) AS month,
COUNT(DISTINCT user_id) AS users,
FROM `training.ga_users`
LEFT JOIN sub1 USING(user_id)
GROUP BY 1,2
)
-- Step 3: aggregate and count
SELECT
cohort_month,
DATE_DIFF(month, cohort_month, MONTH) AS age_months,
users,
users/FIRST_VALUE(users) OVER (PARTITION BY cohort_month ORDER BY month) AS pct_retention
FROM sub2
ORDER BY cohort_month, age_months
cohort_month | age_months | users | pct_retention |
-------------|------------|-------|---------------|
2017-01-01 | 0 | 5267 | 1.000000 |
2017-01-01 | 1 | 205 | 0.038922 |
2017-01-01 | 2 | 98 | 0.018606 |
2017-01-01 | 3 | 62 | 0.011771 |
2017-01-01 | 4 | 49 | 0.009303 |
2017-01-01 | 5 | 34 | 0.006455 |
2017-02-01 | 0 | 4904 | 1.000000 |
2017-02-01 | 1 | 147 | 0.029976 |
2017-02-01 | 2 | 45 | 0.009176 |
2017-02-01 | 3 | 37 | 0.007545 |
2017-02-01 | 4 | 23 | 0.004690 |
2017-03-01 | 0 | 5577 | 1.000000 |
2017-03-01 | 1 | 154 | 0.027613 |
2017-03-01 | 2 | 63 | 0.011296 |
2017-03-01 | 3 | 30 | 0.005379 |
2017-04-01 | 0 | 5287 | 1.000000 |
2017-04-01 | 1 | 163 | 0.030830 |
2017-04-01 | 2 | 53 | 0.010025 |
2017-05-01 | 0 | 4877 | 1.000000 |
2017-05-01 | 1 | 189 | 0.038753 |
2017-06-01 | 0 | 4877 | 1.000000 |