Recommender Vignette

using IteratedProcessSimulations
using Recommendation
using DataFrames
using Soss
using MeasureTheory
using Chain
using DataFrameMacros
using UUIDs
using VegaLite
import Distributions

Simulation Premises

A bookstore has customer account data on previous purchases as well as a monthly newsletter in which it can suggest three books to read, personalized to each customer.

All books cost the same. Each book has latent attributes, quality and topicality, which are fixed. Each customer has unique preferences weighting these two factors and resulting in a utility score. The customer chooses the highest utility book each month, as long as it has a utility greater than 0 (the utility of the most attractive non-book good available and average score across all books and attributes).

New books are released each month. The bookstore uses a collaborative filter to identify optimal books to offer to each user. Once a user has chosen a book, its user-specific utility becomes visible to the bookstore (i.e. a rating).

To gather feedback on newly released books, the bookstore distributes copies to 10% of users in the release month, in exchange for their rating, which becomes available instantly. For the sake of simplicity, these books can still be purchased (consumed) again by the same user.

The simulation runs over a course of 36 months.

Simulate User Preferences

n_users = 100  # number of bookstore customers simulated
n_books_per_month = 15  # new books released in each monthly epoch
n_months = 36  # length of the simulation, in monthly epochs
pct_pre_read = 0.1  # X% of new books are 'pre-read' by users

# Define data generating process for the users.
# Soss model: each user draws a preference weight for quality from a truncated
# normal on [0, 1]; the topicality weight is the complement, so the two weights
# always sum to 1 (a single latent preference dimension per user).
user_dgp = @model params begin
        user_utility_weight_quality ~ Distributions.TruncatedNormal(0.5, 0.1, 0, 1)
        user_utility_weight_topicality = 1 - user_utility_weight_quality
end

# One-row description of the user sample: draw all `n_users` users at once,
# in epoch 0 (i.e. before the simulation's monthly epochs begin).
user_sim_description = DataFrame(
    "n_datapoints" => fill(n_users, 1),
    "epoch" => fill(0, 1),
)

1 rows × 2 columns

n_datapointsepoch
Int64Int64
11000

Create user sample

# Draw one row of attributes per user from the user DGP, then replace the
# generated string `:id` with a sequential integer `:user_id` — the
# recommender needs dense integer user indices. `@c` runs the transform
# column-wise (over the whole `:id` column) rather than row-wise.
user_attributes = @chain generate_data(user_dgp, user_sim_description) begin
    @transform(:user_id = @c 1:length(:id))
    @select(:user_id, :user_utility_weight_quality, :user_utility_weight_topicality)
end

first(user_attributes, 4)

4 rows × 3 columns

user_iduser_utility_weight_qualityuser_utility_weight_topicality
Int64Float64Float64
110.5144820.485518
220.300250.69975
330.5369370.463063
440.6494630.350537

Define data generating process for the books

# Book DGP: each book draws fixed latent quality and topicality scores
# (mean 10, sd 3), truncated to [0, 100] so scores stay non-negative.
book_dgp = @model params begin
        quality ~ Distributions.TruncatedNormal(10, 3, 0, 100)
        topicality ~ Distributions.TruncatedNormal(10, 3, 0, 100)
end

# One row per simulation epoch: each month releases `n_books_per_month` new
# books. Use the `n_months` constant defined above rather than hard-coding 36,
# so the simulation length can be changed in a single place.
book_sim_description = DataFrame(
    "n_datapoints" => fill(n_books_per_month, n_months),
    "epoch" => 1:n_months,
)

# TODO: remove line, just for testing
book_attributes = generate_data(book_dgp, eachrow(book_sim_description)[2])

first(book_attributes, 4)

4 rows × 6 columns

topicalityqualityidepochobservedpredicted_labels
Float64Float64StringInt64BoolNothing
111.04827.67744ec6fffc4-1ac5-4432-8a71-51896def33ae20
210.120212.81339da84e18-3913-4c53-a18b-155984f2321420
315.06712.395743ee7e8a-a37d-4a34-8a84-0e4cce4b4e8420
410.03499.7109344c41545-8b26-48c6-859b-938813e716ed20

Build user-book dataframe via transform_data function

# Expand one epoch's book table into the full user × book grid the simulation
# operates on.
#
# Steps:
#   1. Assign each book a globally-unique integer `:book_id`: sequential within
#      the epoch, offset by `(epoch - 1) * nrow(book_df)` so ids never collide
#      across epochs (assumes every epoch releases `nrow(book_df)` books —
#      TODO confirm this holds for all epochs).
#   2. Cross-join against the global `user_attributes` table so every user is
#      paired with every new book.
#   3. Compute each user's true utility for each book (preference-weighted sum
#      of the book's latent attributes), and flag pre-read review copies with
#      one independent Bernoulli(`pct_pre_read`) draw per (user, book) row.
#      NOTE(review): `user_attributes` and `pct_pre_read` are read from global
#      scope, not passed as arguments.
#   4. Add a `:predicted_utility` column initialised to `missing`; it is widened
#      to `Union{Missing, Float64}` so the recommender's Float64 predictions can
#      be written into it later.
function transform_data(book_df)

    book_df = @chain book_df @transform(:book_id = @c 1:nrow(book_df)) @transform(:book_id = :book_id + (:epoch - 1) * nrow(book_df))
    user_book_df = @chain book_df begin
        crossjoin(user_attributes)
    end

    user_book_df = @chain user_book_df begin
        @transform(
            :user_book_utility = :topicality * :user_utility_weight_topicality + :quality * :user_utility_weight_quality,
            # X% of new books are 'pre-read' by users
            :pre_read = rand(Bernoulli(pct_pre_read))
                   )
    end

    user_book_df[!, :predicted_utility] .= missing
    user_book_df[!, :predicted_utility] = convert(Vector{Union{Missing, Float64}}, user_book_df[!, :predicted_utility])

    return user_book_df
end

# TODO: remove line, just for testing
new_data = transform_data(book_attributes)

first(new_data, 4)

4 rows × 13 columns

topicalityqualityidepochobservedpredicted_labelsbook_iduser_iduser_utility_weight_qualityuser_utility_weight_topicalityuser_book_utilitypre_readpredicted_utility
Float64Float64StringInt64BoolNothingInt64Int64Float64Float64Float64BoolFloat64?
111.04827.67744ec6fffc4-1ac5-4432-8a71-51896def33ae201610.5144820.4855189.313980missing
211.04827.67744ec6fffc4-1ac5-4432-8a71-51896def33ae201620.300250.6997510.03610missing
311.04827.67744ec6fffc4-1ac5-4432-8a71-51896def33ae201630.5369370.4630639.238290missing
411.04827.67744ec6fffc4-1ac5-4432-8a71-51896def33ae201640.6494630.3505378.8590missing

Define Machine Learning Model

# TODO: remove line, just for testing
training_data = new_data

# Convert a user-book DataFrame into the `DataAccessor` that Recommendation.jl
# models consume.
#
# Each row becomes an `Event(user_id, book_id, rating)` triple. Here we assume
# that a user knows and reports their utility after having read the book, so
# the observed `user_book_utility` serves directly as the rating.
#
# `n_users` / `n_books` size the (sparse) user-item matrix; they must be at
# least as large as the maximum ids appearing in `df`.
function convert_dataframe_to_recommender(df::DataFrame, n_users, n_books)
    # Accumulate into a concretely-typed Vector{Event} rather than an untyped
    # `[]` (Vector{Any}) followed by a `convert` — idiomatic and avoids boxing.
    event_list = Event[]
    for row in eachrow(df)
        push!(event_list, Event(row[:user_id], row[:book_id], row[:user_book_utility]))
    end

    return DataAccessor(event_list, n_users, n_books)
end

# Fit (re-fit) the SVD collaborative filter for one epoch.
#
# NOTE(review): despite the non-`!` name, this function mutates its
# `training_data` argument — `append!` permanently folds `new_data` into the
# caller's training dataset. This is intentional (the simulation accumulates
# history across epochs), but the naming hides the mutation; consider renaming
# to `fit_model!` in a future revision.
#
# Returns the built `SVD` recommender (rank-10 factorization) trained on all
# rows whose outcome is visible to the store: purchased (`:observed`) or
# distributed as a review copy (`:pre_read`).
function fit_model(epoch_parameters::DataFrameRow, training_data::DataFrame, new_data::DataFrame)
        # Note, the statement below permanently adds the new data to the training dataset
        append!(training_data, new_data, promote=true)

        # Matrix dimensions must cover every id seen so far, including books
        # whose outcomes are still unobserved.
        n_users = maximum(training_data[!, :user_id])
        n_books = maximum(training_data[!, :book_id])
        # Drop unobserved outcomes
        # (rebinds the local name only — the caller's DataFrame keeps all rows)
        training_data = @chain training_data @subset(:observed | :pre_read)


        data = convert_dataframe_to_recommender(training_data, n_users, n_books)
        recommender = SVD(data, 10)
        build!(recommender)

        return recommender
end

fit_model(eachrow(book_sim_description)[1], training_data, new_data)
Recommendation.SVD(Recommendation.DataAccessor(Recommendation.Event[Recommendation.Event(12, 16, 9.515944852840331), Recommendation.Event(16, 16, 9.493420422158717), Recommendation.Event(23, 16, 9.602995885931103), Recommendation.Event(32, 16, 9.631113534034048), Recommendation.Event(46, 16, 9.168662479879151), Recommendation.Event(47, 16, 9.867610889063794), Recommendation.Event(50, 16, 9.575901908098166), Recommendation.Event(51, 16, 9.60576813827057), Recommendation.Event(77, 16, 9.267029916750198), Recommendation.Event(3, 17, 11.566230771558324)  …  Recommendation.Event(41, 30, 7.289597670202056), Recommendation.Event(46, 30, 8.784368443736202), Recommendation.Event(63, 30, 8.54369411183778), Recommendation.Event(67, 30, 8.30240913341008), Recommendation.Event(73, 30, 8.64870749525196), Recommendation.Event(74, 30, 9.134883286370302), Recommendation.Event(81, 30, 7.544380344932471), Recommendation.Event(88, 30, 8.555875586291524), Recommendation.Event(94, 30, 8.39110494863881), Recommendation.Event(96, 30, 8.26293250196835)], [NaN NaN … 17.475438301898762 NaN; NaN NaN … NaN NaN; … ; NaN NaN … NaN NaN; NaN NaN … NaN NaN], Dict{Int64, Any}(), Dict{Int64, Any}()), 10, Union{Nothing, Float64}[0.08355367605437564 -0.026134613266781376 … -0.03420201858791656 -0.05936916372171807; 0.014287208163610676 0.013254255723090372 … -0.0024508458642997152 -0.18533734818760617; … ; 0.096017276367716 0.007876358892420093 … 0.06204078887726933 0.037685255920138896; 0.07799797047469953 -0.02140718919697145 … -0.017981755996201484 0.041090133765272055], Union{Nothing, Float64}[111.62446777282649, 98.71216232697275, 86.0488878712404, 80.33508370095045, 68.45514600033641, 62.27783486412369, 59.389618326006286, 55.48817518904382, 49.818842060548356, 47.05621195159057], Union{Nothing, Float64}[5.598293801596863e-18 5.551115123125783e-17 … 0.0995632302193775 0.24200448889416784; -1.9801356300431266e-18 -1.3877787807814457e-17 … 0.006917464946033532 0.10589729480741074; … ; 
-5.134612071840844e-18 2.0816681711721685e-17 … 0.1882824258847828 -0.06866977834290362; -9.436120576798104e-18 -2.7755575615628914e-17 … 0.04873726867452397 -0.06033393166076437])
# Skip using this to track parameter / model outcomes for now, but could be useful in a real study...
# Record per-epoch model metadata. Currently only the epoch number is tracked,
# but this hook could capture parameter / model outcomes in a real study.
function summarize_model(epoch_parameters::DataFrameRow, model, simulation_data::DataFrame, new_data::DataFrame)
    epoch_summary = DataFrame(:epoch => [epoch_parameters.epoch])
    return epoch_summary
end
summarize_model (generic function with 1 method)
# Simulate one month of purchases: every user with at least one unread book
# buys the single book the recommender ranks highest for them.
#
# For each such user, `recommend` is asked for the top-1 item restricted to
# that user's still-unobserved book ids; the chosen (user, book) row is then
# marked `:observed = true` and the recommender's score is stored in
# `:predicted_utility`. Mutates and returns `simulation_data`.
#
# NOTE(review): the user buys the recommender's top pick unconditionally —
# the "utility > 0" outside option from the premises is not enforced here;
# confirm that is the intended simplification.
function choose_observations(epoch_parameters::DataFrameRow, recommender, new_data::DataFrame, simulation_data::DataFrame)
    # NOTE: as the new_data is already added to the simulation data during the model fit, no need to use `new_data` here

    # Each user gets to read an additional book!
    for user_id in unique((@chain simulation_data @subset(!:observed) _[!, :user_id]))
        # Candidate set: this user's unread books; take the single best-scored one.
        user_prediction = recommend(recommender, user_id, 1, (@chain simulation_data @subset(!:observed & (:user_id == user_id)) @select(:book_id) unique _[!, :book_id]))
        best_book = user_prediction[1][1]
        best_book_score = user_prediction[1][2]
        # Flag the chosen row as purchased and record the model's forecast.
        simulation_data[((simulation_data[!, :user_id] .== user_id) .& (simulation_data[!, :book_id] .== best_book)), :observed] .= true
        simulation_data[((simulation_data[!, :user_id] .== user_id) .& (simulation_data[!, :book_id] .== best_book)), :predicted_utility] .= best_book_score
    end

    return simulation_data
end
choose_observations (generic function with 1 method)

Put it all together and run the simulation

# Assemble the simulation from the book DGP, the per-epoch schedule, and the
# four callbacks defined above, then run all epochs.
ips = IteratedProcessSimulation(book_dgp, book_sim_description, transform_data, fit_model, summarize_model, choose_observations)

simulation_data, model_summary, model_objects = run_simulation(ips)

# TODO: for debugging, remove
user_id = 1
recommender = model_objects[36]
Recommendation.SVD(Recommendation.DataAccessor(Recommendation.Event[Recommendation.Event(24, 1, 11.07392731435876), Recommendation.Event(25, 1, 11.074849799492153), Recommendation.Event(57, 1, 11.109913085446344), Recommendation.Event(59, 1, 11.142106611631375), Recommendation.Event(79, 1, 11.119510585570426), Recommendation.Event(23, 2, 5.768074613645223), Recommendation.Event(74, 2, 6.1963439009825), Recommendation.Event(76, 2, 5.924029832580604), Recommendation.Event(94, 2, 5.896485270942696), Recommendation.Event(2, 3, 11.200770141573434)  …  Recommendation.Event(90, 539, 9.165672388410753), Recommendation.Event(8, 540, 9.953628903306619), Recommendation.Event(28, 540, 10.7306881051593), Recommendation.Event(38, 540, 10.816388977593874), Recommendation.Event(40, 540, 10.168435562792494), Recommendation.Event(45, 540, 11.773586617133558), Recommendation.Event(51, 540, 10.312538762054634), Recommendation.Event(69, 540, 9.760542212456357), Recommendation.Event(91, 540, 11.033963853145643), Recommendation.Event(93, 540, 11.647403808749218)], [NaN NaN … NaN NaN; NaN NaN … NaN NaN; … ; NaN NaN … NaN NaN; NaN NaN … NaN NaN], Dict{Int64, Any}(), Dict{Int64, Any}()), 10, Union{Nothing, Float64}[-0.10012638500267834 0.05003880953603024 … 0.0546905062029917 -0.1236259502818227; -0.08901207948256673 -0.08820804020876574 … 0.046116632169862865 0.04632167212184227; … ; -0.0907958147261419 -0.1396938984031464 … -0.07225485477743088 -0.059644101304259595; -0.10269904625520226 -0.01410229324663866 … 0.03393296014555445 -0.10480942068302372], Union{Nothing, Float64}[479.6308410621238, 222.33296526141234, 201.96185560951434, 188.9747196744335, 180.13862696299518, 176.72263058535552, 168.63397879087532, 161.5344635777463, 156.54855950652282, 141.5109887209243], Union{Nothing, Float64}[-0.0101297099242509 -0.004864486434046431 … -0.025525359715715183 -0.020444117239746224; -0.023960936692049087 -0.0046500001057154005 … 0.008263252736047624 0.014640475326243085; … ; 
0.026143996759117095 -0.005506422001605704 … 0.05311404896282505 0.007459616471914163; 0.01348334352556254 -0.005308522160959778 … 0.0295702761593563 -0.010224011208820728])

Assess outcome quality

# Per-user outcome summary:
#  - achieved utility: sum over books the user actually read (`:observed`)
#  - predicted utility: the recommender's forecasts for those same books
#  - possible utility: best-case sum of the user's top `n_months` books
#    (the user can choose at most one book per month, so `n_months` total)
#  - pct achieved: achieved / possible, the headline effectiveness measure
utility_rollup = @chain simulation_data begin
    @groupby(:user_id, :user_utility_weight_quality, :user_utility_weight_topicality)
    @combine(:user_utility_achieved = sum(:user_book_utility[:observed]),
             :user_utility_predicted = sum(:predicted_utility[:observed]), # this should be strictly positive
             :n_books_purchased = length(:predicted_utility[:observed]),
             :user_utility_possible = @c sum(sort(:user_book_utility, rev=true)[1:n_months]) # user has the possibility of choosing X books = n_months
             )
    @transform(:pct_utility_achieved = :user_utility_achieved / :user_utility_possible)
end

first(utility_rollup, 6)

6 rows × 8 columns

user_iduser_utility_weight_qualityuser_utility_weight_topicalityuser_utility_achieveduser_utility_predictedn_books_purchaseduser_utility_possiblepct_utility_achieved
Int64Float64Float64Float64Float64Int64Float64Float64
110.5144820.485518425.229203.88836504.8970.84221
220.300250.69975433.973234.01436517.5260.838553
330.5369370.463063424.728175.27836505.0430.840973
440.6494630.350537420.248170.70736509.8160.824314
550.5034360.496564422.792223.80336504.8670.837432
660.5367990.463201429.37233.21136505.0420.850166

Plot Utility Distribution across Users

utility_rollup |> @vlplot(:bar, width=500, height=300, x={:user_utility_achieved, bin={step=0.5}, title="Total Utility Achieved"}, y={"count()", title="User Count"}, title="Utility Achieved per User")

Plot Predicted Utility vs Actual Utility

utility_rollup |> @vlplot(:point, width=500, height=500, x={:user_utility_achieved, title="Total Utility Achieved"}, y={:user_utility_possible, title="Total Utility Possible"}, title="Model Relatively Ineffective", resolve={scale={x=:independent, y=:independent}})

Plot Percent Utility Achieved across Users

utility_rollup |> @vlplot(width=500, height=300, :bar, x={:pct_utility_achieved, bin={step=0.005}, title="Percent Utility Achieved", axis={format="%"}}, y={"count()", title="User Count"}, title="Percentage of Possible Utility Achieved per User")

Plot User Preferences

utility_rollup |> @vlplot(width=500, height=300, :bar, x={:user_utility_weight_quality, bin={step=0.05}, title="User Preference for Quality (over Topicality)", axis={format="%"}}, y={"count()", title="User Count"}, title="User Preferences, Percentage Weight Quality (vs Topicality)")

Plot Individual Preferences Against Total Utility Possible

utility_rollup |> @vlplot(width=500, height=300, :bar, x={:user_utility_weight_quality, bin={step=0.01}, title="User Preference for Quality (over Topicality)", axis={format="%"}}, y={"mean(user_utility_possible)", title="Possible Utility"}, title="Possible Utility by User Preferences")

Plot Individual Preferences Against Utility

utility_rollup |> @vlplot(width=500, height=300, :bar, x={:user_utility_weight_quality, bin={step=0.01}, title="User Preference for Quality (over Topicality)", axis={format="%"}}, y={"mean(pct_utility_achieved)", title="Average Percent Utility Achieved", axis={format="%"}}, title="Percentage of Possible Utility Achieved by User Preferences")
utility_rollup |> @vlplot(width=500, height=300, :bar, x={:user_utility_weight_quality, bin={step=0.01}, title="User Preference for Quality (over Topicality)", axis={format="%"}}, y={"mean(user_utility_achieved)", title="Average Utility Achieved"}, title="Average Utility Achieved by User Preferences")