Recommender Vignette

using IteratedProcessSimulations
using Recommendation
using DataFrames
using Soss
using MeasureTheory
using Chain
using DataFrameMacros
using UUIDs
using VegaLite
import Distributions

Simulation Premises

A bookstore has customer account data on previous purchases as well as a monthly newsletter in which it can suggest three books to read, personalized to each customer.

All books cost the same. Each book has latent attributes, quality and topicality, which are fixed. Each customer has unique preferences weighting these two factors and resulting in a utility score. The customer chooses the highest utility book each month, as long as it has a utility greater than 0 (the utility of the most attractive non-book good available and average score across all books and attributes).

New books are released each month. The bookstore uses a collaborative filter to identify optimal books to offer to each user. Once a user has chosen a book, its user-specific utility becomes visible to the bookstore (i.e. a rating).

To gather feedback on newly released books, the bookstore distributes copies to 10% of users in the release month, in exchange for their rating, which becomes available instantly. For the sake of simplicity, these books can still be purchased (consumed) again by the same user.

The simulation runs over a course of 36 months.

Simulate User Preferences

n_users = 100  # number of bookstore customers simulated
n_books_per_month = 15  # new books released in each monthly epoch
n_months = 36  # length of the simulation, in monthly epochs
pct_pre_read = 0.1  # X% of new books are 'pre-read' by users

# Define data generating process for the users.
# Soss model: each user draws a preference weight for quality from a truncated
# normal on [0, 1]; the topicality weight is the complement, so the two weights
# always sum to 1 (a single latent preference dimension per user).
user_dgp = @model params begin
        user_utility_weight_quality ~ Distributions.TruncatedNormal(0.5, 0.1, 0, 1)
        user_utility_weight_topicality = 1 - user_utility_weight_quality
end

# One-row description of the user sample: draw all `n_users` users at once,
# in epoch 0 (i.e. before the simulation's monthly epochs begin).
user_sim_description = DataFrame(
    "n_datapoints" => fill(n_users, 1),
    "epoch" => fill(0, 1),
)

1 rows × 2 columns

n_datapointsepoch
Int64Int64
11000

Create user sample

# Draw one row of attributes per user from the user DGP, then replace the
# generated string `:id` with a sequential integer `:user_id` — the
# recommender needs dense integer user indices. `@c` runs the transform
# column-wise (over the whole `:id` column) rather than row-wise.
user_attributes = @chain generate_data(user_dgp, user_sim_description) begin
    @transform(:user_id = @c 1:length(:id))
    @select(:user_id, :user_utility_weight_quality, :user_utility_weight_topicality)
end

first(user_attributes, 4)

4 rows × 3 columns

user_iduser_utility_weight_qualityuser_utility_weight_topicality
Int64Float64Float64
110.5144820.485518
220.300250.69975
330.5369370.463063
440.6494630.350537

Define data generating process for the books

# Book DGP: each book draws fixed latent quality and topicality scores
# (mean 10, sd 3), truncated to [0, 100] so scores stay non-negative.
book_dgp = @model params begin
        quality ~ Distributions.TruncatedNormal(10, 3, 0, 100)
        topicality ~ Distributions.TruncatedNormal(10, 3, 0, 100)
end

# One row per simulation epoch: each month releases `n_books_per_month` new
# books. Use the `n_months` constant defined above rather than hard-coding 36,
# so the simulation length can be changed in a single place.
book_sim_description = DataFrame(
    "n_datapoints" => fill(n_books_per_month, n_months),
    "epoch" => 1:n_months,
)

# TODO: remove line, just for testing
book_attributes = generate_data(book_dgp, eachrow(book_sim_description)[2])

first(book_attributes, 4)

4 rows × 6 columns

topicalityqualityidepochobservedpredicted_labels
Float64Float64StringInt64BoolNothing
111.04827.67744ec6fffc4-1ac5-4432-8a71-51896def33ae20
210.120212.81339da84e18-3913-4c53-a18b-155984f2321420
315.06712.395743ee7e8a-a37d-4a34-8a84-0e4cce4b4e8420
410.03499.7109344c41545-8b26-48c6-859b-938813e716ed20

Build user-book dataframe via transform_data function

# Expand one epoch's book table into the full user × book grid the simulation
# operates on.
#
# Steps:
#   1. Assign each book a globally-unique integer `:book_id`: sequential within
#      the epoch, offset by `(epoch - 1) * nrow(book_df)` so ids never collide
#      across epochs (assumes every epoch releases `nrow(book_df)` books —
#      TODO confirm this holds for all epochs).
#   2. Cross-join against the global `user_attributes` table so every user is
#      paired with every new book.
#   3. Compute each user's true utility for each book (preference-weighted sum
#      of the book's latent attributes), and flag pre-read review copies with
#      one independent Bernoulli(`pct_pre_read`) draw per (user, book) row.
#      NOTE(review): `user_attributes` and `pct_pre_read` are read from global
#      scope, not passed as arguments.
#   4. Add a `:predicted_utility` column initialised to `missing`; it is widened
#      to `Union{Missing, Float64}` so the recommender's Float64 predictions can
#      be written into it later.
function transform_data(book_df)

    book_df = @chain book_df @transform(:book_id = @c 1:nrow(book_df)) @transform(:book_id = :book_id + (:epoch - 1) * nrow(book_df))
    user_book_df = @chain book_df begin
        crossjoin(user_attributes)
    end

    user_book_df = @chain user_book_df begin
        @transform(
            :user_book_utility = :topicality * :user_utility_weight_topicality + :quality * :user_utility_weight_quality,
            # X% of new books are 'pre-read' by users
            :pre_read = rand(Bernoulli(pct_pre_read))
                   )
    end

    user_book_df[!, :predicted_utility] .= missing
    user_book_df[!, :predicted_utility] = convert(Vector{Union{Missing, Float64}}, user_book_df[!, :predicted_utility])

    return user_book_df
end

# TODO: remove line, just for testing
new_data = transform_data(book_attributes)

first(new_data, 4)

4 rows × 13 columns

topicalityqualityidepochobservedpredicted_labelsbook_iduser_iduser_utility_weight_qualityuser_utility_weight_topicalityuser_book_utilitypre_readpredicted_utility
Float64Float64StringInt64BoolNothingInt64Int64Float64Float64Float64BoolFloat64?
111.04827.67744ec6fffc4-1ac5-4432-8a71-51896def33ae201610.5144820.4855189.313980missing
211.04827.67744ec6fffc4-1ac5-4432-8a71-51896def33ae201620.300250.6997510.03610missing
311.04827.67744ec6fffc4-1ac5-4432-8a71-51896def33ae201630.5369370.4630639.238290missing
411.04827.67744ec6fffc4-1ac5-4432-8a71-51896def33ae201640.6494630.3505378.8590missing

Define Machine Learning Model

# TODO: remove line, just for testing
training_data = new_data

# Convert a user-book DataFrame into the `DataAccessor` that Recommendation.jl
# models consume.
#
# Each row becomes an `Event(user_id, book_id, rating)` triple. Here we assume
# that a user knows and reports their utility after having read the book, so
# the observed `user_book_utility` serves directly as the rating.
#
# `n_users` / `n_books` size the (sparse) user-item matrix; they must be at
# least as large as the maximum ids appearing in `df`.
function convert_dataframe_to_recommender(df::DataFrame, n_users, n_books)
    # Accumulate into a concretely-typed Vector{Event} rather than an untyped
    # `[]` (Vector{Any}) followed by a `convert` — idiomatic and avoids boxing.
    event_list = Event[]
    for row in eachrow(df)
        push!(event_list, Event(row[:user_id], row[:book_id], row[:user_book_utility]))
    end

    return DataAccessor(event_list, n_users, n_books)
end

# Fit (re-fit) the SVD collaborative filter for one epoch.
#
# NOTE(review): despite the non-`!` name, this function mutates its
# `training_data` argument — `append!` permanently folds `new_data` into the
# caller's training dataset. This is intentional (the simulation accumulates
# history across epochs), but the naming hides the mutation; consider renaming
# to `fit_model!` in a future revision.
#
# Returns the built `SVD` recommender (rank-10 factorization) trained on all
# rows whose outcome is visible to the store: purchased (`:observed`) or
# distributed as a review copy (`:pre_read`).
function fit_model(epoch_parameters::DataFrameRow, training_data::DataFrame, new_data::DataFrame)
        # Note, the statement below permanently adds the new data to the training dataset
        append!(training_data, new_data, promote=true)

        # Matrix dimensions must cover every id seen so far, including books
        # whose outcomes are still unobserved.
        n_users = maximum(training_data[!, :user_id])
        n_books = maximum(training_data[!, :book_id])
        # Drop unobserved outcomes
        # (rebinds the local name only — the caller's DataFrame keeps all rows)
        training_data = @chain training_data @subset(:observed | :pre_read)


        data = convert_dataframe_to_recommender(training_data, n_users, n_books)
        recommender = SVD(data, 10)
        build!(recommender)

        return recommender
end

fit_model(eachrow(book_sim_description)[1], training_data, new_data)
Recommendation.SVD(Recommendation.DataAccessor(Recommendation.Event[Recommendation.Event(12, 16, 9.515944852840331), Recommendation.Event(16, 16, 9.493420422158717), Recommendation.Event(23, 16, 9.602995885931103), Recommendation.Event(32, 16, 9.631113534034048), Recommendation.Event(46, 16, 9.168662479879151), Recommendation.Event(47, 16, 9.867610889063794), Recommendation.Event(50, 16, 9.575901908098166), Recommendation.Event(51, 16, 9.60576813827057), Recommendation.Event(77, 16, 9.267029916750198), Recommendation.Event(3, 17, 11.566230771558324)  …  Recommendation.Event(41, 30, 7.289597670202056), Recommendation.Event(46, 30, 8.784368443736202), Recommendation.Event(63, 30, 8.54369411183778), Recommendation.Event(67, 30, 8.30240913341008), Recommendation.Event(73, 30, 8.64870749525196), Recommendation.Event(74, 30, 9.134883286370302), Recommendation.Event(81, 30, 7.544380344932471), Recommendation.Event(88, 30, 8.555875586291524), Recommendation.Event(94, 30, 8.39110494863881), Recommendation.Event(96, 30, 8.26293250196835)], [NaN NaN … 17.475438301898762 NaN; NaN NaN … NaN NaN; … ; NaN NaN … NaN NaN; NaN NaN … NaN NaN], Dict{Int64, Any}(), Dict{Int64, Any}()), 10, Union{Nothing, Float64}[0.08355367605437564 -0.026134613266781376 … -0.03420201858791656 -0.05936916372171807; 0.014287208163610676 0.013254255723090372 … -0.0024508458642997152 -0.18533734818760617; … ; 0.096017276367716 0.007876358892420093 … 0.06204078887726933 0.037685255920138896; 0.07799797047469953 -0.02140718919697145 … -0.017981755996201484 0.041090133765272055], Union{Nothing, Float64}[111.62446777282649, 98.71216232697275, 86.0488878712404, 80.33508370095045, 68.45514600033641, 62.27783486412369, 59.389618326006286, 55.48817518904382, 49.818842060548356, 47.05621195159057], Union{Nothing, Float64}[5.598293801596863e-18 5.551115123125783e-17 … 0.0995632302193775 0.24200448889416784; -1.9801356300431266e-18 -1.3877787807814457e-17 … 0.006917464946033532 0.10589729480741074; … ; 
-5.134612071840844e-18 2.0816681711721685e-17 … 0.1882824258847828 -0.06866977834290362; -9.436120576798104e-18 -2.7755575615628914e-17 … 0.04873726867452397 -0.06033393166076437])
# Skip using this to track parameter / model outcomes for now, but could be useful in a real study...
# Record per-epoch model metadata. Currently only the epoch number is tracked,
# but this hook could capture parameter / model outcomes in a real study.
function summarize_model(epoch_parameters::DataFrameRow, model, simulation_data::DataFrame, new_data::DataFrame)
    epoch_summary = DataFrame(:epoch => [epoch_parameters.epoch])
    return epoch_summary
end
summarize_model (generic function with 1 method)
# Simulate one month of purchases: every user with at least one unread book
# buys the single book the recommender ranks highest for them.
#
# For each such user, `recommend` is asked for the top-1 item restricted to
# that user's still-unobserved book ids; the chosen (user, book) row is then
# marked `:observed = true` and the recommender's score is stored in
# `:predicted_utility`. Mutates and returns `simulation_data`.
#
# NOTE(review): the user buys the recommender's top pick unconditionally —
# the "utility > 0" outside option from the premises is not enforced here;
# confirm that is the intended simplification.
function choose_observations(epoch_parameters::DataFrameRow, recommender, new_data::DataFrame, simulation_data::DataFrame)
    # NOTE: as the new_data is already added to the simulation data during the model fit, no need to use `new_data` here

    # Each user gets to read an additional book!
    for user_id in unique((@chain simulation_data @subset(!:observed) _[!, :user_id]))
        # Candidate set: this user's unread books; take the single best-scored one.
        user_prediction = recommend(recommender, user_id, 1, (@chain simulation_data @subset(!:observed & (:user_id == user_id)) @select(:book_id) unique _[!, :book_id]))
        best_book = user_prediction[1][1]
        best_book_score = user_prediction[1][2]
        # Flag the chosen row as purchased and record the model's forecast.
        simulation_data[((simulation_data[!, :user_id] .== user_id) .& (simulation_data[!, :book_id] .== best_book)), :observed] .= true
        simulation_data[((simulation_data[!, :user_id] .== user_id) .& (simulation_data[!, :book_id] .== best_book)), :predicted_utility] .= best_book_score
    end

    return simulation_data
end
choose_observations (generic function with 1 method)

Put it all together and run the simulation

# Assemble the simulation from the book DGP, the per-epoch schedule, and the
# four callbacks defined above, then run all epochs.
ips = IteratedProcessSimulation(book_dgp, book_sim_description, transform_data, fit_model, summarize_model, choose_observations)

simulation_data, model_summary, model_objects = run_simulation(ips)

# TODO: for debugging, remove
user_id = 1
recommender = model_objects[36]
Recommendation.SVD(Recommendation.DataAccessor(Recommendation.Event[Recommendation.Event(24, 1, 11.07392731435876), Recommendation.Event(25, 1, 11.074849799492153), Recommendation.Event(57, 1, 11.109913085446344), Recommendation.Event(59, 1, 11.142106611631375), Recommendation.Event(79, 1, 11.119510585570426), Recommendation.Event(23, 2, 5.768074613645223), Recommendation.Event(74, 2, 6.1963439009825), Recommendation.Event(76, 2, 5.924029832580604), Recommendation.Event(94, 2, 5.896485270942696), Recommendation.Event(2, 3, 11.200770141573434)  …  Recommendation.Event(90, 539, 9.165672388410753), Recommendation.Event(8, 540, 9.953628903306619), Recommendation.Event(28, 540, 10.7306881051593), Recommendation.Event(38, 540, 10.816388977593874), Recommendation.Event(40, 540, 10.168435562792494), Recommendation.Event(45, 540, 11.773586617133558), Recommendation.Event(51, 540, 10.312538762054634), Recommendation.Event(69, 540, 9.760542212456357), Recommendation.Event(91, 540, 11.033963853145643), Recommendation.Event(93, 540, 11.647403808749218)], [NaN NaN … NaN NaN; NaN NaN … NaN NaN; … ; NaN NaN … NaN NaN; NaN NaN … NaN NaN], Dict{Int64, Any}(), Dict{Int64, Any}()), 10, Union{Nothing, Float64}[-0.10012638500267834 0.05003880953603024 … 0.0546905062029917 -0.1236259502818227; -0.08901207948256673 -0.08820804020876574 … 0.046116632169862865 0.04632167212184227; … ; -0.0907958147261419 -0.1396938984031464 … -0.07225485477743088 -0.059644101304259595; -0.10269904625520226 -0.01410229324663866 … 0.03393296014555445 -0.10480942068302372], Union{Nothing, Float64}[479.6308410621238, 222.33296526141234, 201.96185560951434, 188.9747196744335, 180.13862696299518, 176.72263058535552, 168.63397879087532, 161.5344635777463, 156.54855950652282, 141.5109887209243], Union{Nothing, Float64}[-0.0101297099242509 -0.004864486434046431 … -0.025525359715715183 -0.020444117239746224; -0.023960936692049087 -0.0046500001057154005 … 0.008263252736047624 0.014640475326243085; … ; 
0.026143996759117095 -0.005506422001605704 … 0.05311404896282505 0.007459616471914163; 0.01348334352556254 -0.005308522160959778 … 0.0295702761593563 -0.010224011208820728])

Assess outcome quality

# Per-user outcome summary:
#  - achieved utility: sum over books the user actually read (`:observed`)
#  - predicted utility: the recommender's forecasts for those same books
#  - possible utility: best-case sum of the user's top `n_months` books
#    (the user can choose at most one book per month, so `n_months` total)
#  - pct achieved: achieved / possible, the headline effectiveness measure
utility_rollup = @chain simulation_data begin
    @groupby(:user_id, :user_utility_weight_quality, :user_utility_weight_topicality)
    @combine(:user_utility_achieved = sum(:user_book_utility[:observed]),
             :user_utility_predicted = sum(:predicted_utility[:observed]), # this should be strictly positive
             :n_books_purchased = length(:predicted_utility[:observed]),
             :user_utility_possible = @c sum(sort(:user_book_utility, rev=true)[1:n_months]) # user has the possibility of choosing X books = n_months
             )
    @transform(:pct_utility_achieved = :user_utility_achieved / :user_utility_possible)
end

first(utility_rollup, 6)

6 rows × 8 columns

user_iduser_utility_weight_qualityuser_utility_weight_topicalityuser_utility_achieveduser_utility_predictedn_books_purchaseduser_utility_possiblepct_utility_achieved
Int64Float64Float64Float64Float64Int64Float64Float64
110.5144820.485518425.229203.88836504.8970.84221
220.300250.69975433.973234.01436517.5260.838553
330.5369370.463063424.728175.27836505.0430.840973
440.6494630.350537420.248170.70736509.8160.824314
550.5034360.496564422.792223.80336504.8670.837432
660.5367990.463201429.37233.21136505.0420.850166

Plot Utility Distribution across Users

utility_rollup |> @vlplot(:bar, width=500, height=300, x={:user_utility_achieved, bin={step=0.5}, title="Total Utility Achieved"}, y={"count()", title="User Count"}, title="Utility Achieved per User")

Plot Predicted Utility vs Actual Utility

utility_rollup |> @vlplot(:point, width=500, height=500, x={:user_utility_achieved, title="Total Utility Achieved"}, y={:user_utility_possible, title="Total Utility Possible"}, title="Model Relatively Ineffective", resolve={scale={x=:independent, y=:independent}})

Plot Percent Utility Achieved across Users

utility_rollup |> @vlplot(width=500, height=300, :bar, x={:pct_utility_achieved, bin={step=0.005}, title="Percent Utility Achieved", axis={format="%"}}, y={"count()", title="User Count"}, title="Percentage of Possible Utility Achieved per User")

Plot User Preferences

utility_rollup |> @vlplot(width=500, height=300, :bar, x={:user_utility_weight_quality, bin={step=0.05}, title="User Preference for Quality (over Topicality)", axis={format="%"}}, y={"count()", title="User Count"}, title="User Preferences, Percentage Weight Quality (vs Topicality)")

Plot Individual Preferences Against Total Utility Possible

utility_rollup |> @vlplot(width=500, height=300, :bar, x={:user_utility_weight_quality, bin={step=0.01}, title="User Preference for Quality (over Topicality)", axis={format="%"}}, y={"mean(user_utility_possible)", title="Possible Utility"}, title="Possible Utility by User Preferences")

Plot Individual Preferences Against Utility

utility_rollup |> @vlplot(width=500, height=300, :bar, x={:user_utility_weight_quality, bin={step=0.01}, title="User Preference for Quality (over Topicality)", axis={format="%"}}, y={"mean(pct_utility_achieved)", title="Average Percent Utility Achieved", axis={format="%"}}, title="Percentage of Possible Utility Achieved by User Preferences")
utility_rollup |> @vlplot(width=500, height=300, :bar, x={:user_utility_weight_quality, bin={step=0.01}, title="User Preference for Quality (over Topicality)", axis={format="%"}}, y={"mean(user_utility_achieved)", title="Average Utility Achieved"}, title="Average Utility Achieved by User Preferences")