Recommender Vignette
using IteratedProcessSimulations
using Recommendation
using DataFrames
using Soss
using MeasureTheory
using Chain
using DataFrameMacros
using UUIDs
using VegaLite
import Distributions
Simulation Premises
A bookstore has customer account data on previous purchases as well as a monthly newsletter in which it can suggest three books to read, personalized to each customer.
All books cost the same. Each book has latent attributes, quality and topicality, which are fixed. Each customer has unique preferences weighting these two factors and resulting in a utility score. The customer chooses the highest utility book each month, as long as it has a utility greater than 0 (the utility of the most attractive non-book good available and average score across all books and attributes).
New books are released each month. The bookstore uses a collaborative filter to identify optimal books to offer to each user. Once a user has chosen a book, its user-specific utility becomes visible to the bookstore (i.e. a rating).
To gather feedback on newly released books, the bookstore distributes copies to 10% of users in the release month, in exchange for their rating, which becomes available instantly. For the sake of simplicity, these books can be repurchased (consumed) by the user.
The simulation runs over a course of 36 months.
Simulate User Preferences
# Simulation parameters. Declared `const` so the functions below that close
# over these globals (`transform_data`, the utility rollup) stay type-stable.
const n_users = 100            # number of customer accounts
const n_books_per_month = 15   # new books released each epoch (month)
const n_months = 36            # length of the simulation in epochs
const pct_pre_read = 0.1 # X% of new books are 'pre-read' by users
# Define data generating process for the users: the quality weight is drawn
# from a Normal(0.5, 0.1) truncated to [0, 1], and the topicality weight is
# its complement, so the two preference weights always sum to 1.
user_dgp = @model params begin
user_utility_weight_quality ~ Distributions.TruncatedNormal(0.5, 0.1, 0, 1)
user_utility_weight_topicality = 1 - user_utility_weight_quality
end
# Simulation description for the user draw: a single "epoch 0" batch
# containing all `n_users` users.
user_sim_description = DataFrame("n_datapoints" => [n_users], "epoch" => [0])
1 rows × 2 columns
n_datapoints | epoch | |
---|---|---|
Int64 | Int64 | |
1 | 100 | 0 |
Create user sample
# Draw the user sample from the DGP, assign sequential integer user ids
# (`@c` makes the transform column-wise, so `1:length(:id)` ranges over the
# whole column), and keep only the id plus the two preference weights.
user_attributes = @chain generate_data(user_dgp, user_sim_description) begin
@transform(:user_id = @c 1:length(:id))
@select(:user_id, :user_utility_weight_quality, :user_utility_weight_topicality)
end
first(user_attributes, 4)
4 rows × 3 columns
user_id | user_utility_weight_quality | user_utility_weight_topicality | |
---|---|---|---|
Int64 | Float64 | Float64 | |
1 | 1 | 0.514482 | 0.485518 |
2 | 2 | 0.30025 | 0.69975 |
3 | 3 | 0.536937 | 0.463063 |
4 | 4 | 0.649463 | 0.350537 |
Define data generating process for the books
# Each book's latent quality and topicality are drawn independently from a
# Normal(10, 3) truncated to [0, 100]; both attributes are fixed for the
# book's lifetime.
book_dgp = @model params begin
quality ~ Distributions.TruncatedNormal(10, 3, 0, 100)
topicality ~ Distributions.TruncatedNormal(10, 3, 0, 100)
end
# One description row per epoch: `n_books_per_month` new books are released
# in each of the `n_months` epochs. (Previously the epoch count was
# hard-coded to 36, bypassing the `n_months` parameter defined above.)
book_sim_description = DataFrame(
"n_datapoints" => fill(n_books_per_month, n_months),
"epoch" => 1:n_months,
)
# TODO: remove line, just for testing
# NOTE(review): exercises the book DGP for a single epoch (row 2 of the
# description, i.e. epoch 2) outside of `run_simulation`.
book_attributes = generate_data(book_dgp, eachrow(book_sim_description)[2])
first(book_attributes, 4)
4 rows × 6 columns
topicality | quality | id | epoch | observed | predicted_labels | |
---|---|---|---|---|---|---|
Float64 | Float64 | String | Int64 | Bool | Nothing | |
1 | 11.0482 | 7.67744 | ec6fffc4-1ac5-4432-8a71-51896def33ae | 2 | 0 | |
2 | 10.1202 | 12.8133 | 9da84e18-3913-4c53-a18b-155984f23214 | 2 | 0 | |
3 | 15.067 | 12.3957 | 43ee7e8a-a37d-4a34-8a84-0e4cce4b4e84 | 2 | 0 | |
4 | 10.0349 | 9.71093 | 44c41545-8b26-48c6-859b-938813e716ed | 2 | 0 |
Build the user-book dataframe via the `transform_data` function
"""
    transform_data(book_df)

Expand one epoch's batch of freshly generated books into one row per
(user, book) pair, computing each user's true utility for every book.
NOTE(review): reads the globals `user_attributes` and `pct_pre_read`.
"""
function transform_data(book_df)
# Sequential ids within the batch, offset by epoch so book ids are unique
# across epochs (assumes every epoch releases the same number of books —
# TODO confirm against `book_sim_description`).
book_df = @chain book_df @transform(:book_id = @c 1:nrow(book_df)) @transform(:book_id = :book_id + (:epoch - 1) * nrow(book_df))
# Cartesian product: one row per (book, user) combination.
user_book_df = @chain book_df begin
crossjoin(user_attributes)
end
user_book_df = @chain user_book_df begin
@transform(
# True utility: preference-weighted sum of the book's latent attributes.
:user_book_utility = :topicality * :user_utility_weight_topicality + :quality * :user_utility_weight_quality,
# X% of new books are 'pre-read' by users
# (DataFrameMacros `@transform` is row-wise, so one Bernoulli draw per row).
:pre_read = rand(Bernoulli(pct_pre_read))
)
end
# Placeholder for the recommender's score, filled in by `choose_observations`;
# widen the column eltype so it can hold Float64 values alongside `missing`.
user_book_df[!, :predicted_utility] .= missing
user_book_df[!, :predicted_utility] = convert(Vector{Union{Missing, Float64}}, user_book_df[!, :predicted_utility])
return user_book_df
end
# TODO: remove line, just for testing
# NOTE(review): smoke-tests the transform on the single debug epoch above.
new_data = transform_data(book_attributes)
first(new_data, 4)
4 rows × 13 columns
topicality | quality | id | epoch | observed | predicted_labels | book_id | user_id | user_utility_weight_quality | user_utility_weight_topicality | user_book_utility | pre_read | predicted_utility | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Float64 | Float64 | String | Int64 | Bool | Nothing | Int64 | Int64 | Float64 | Float64 | Float64 | Bool | Float64? | |
1 | 11.0482 | 7.67744 | ec6fffc4-1ac5-4432-8a71-51896def33ae | 2 | 0 | 16 | 1 | 0.514482 | 0.485518 | 9.31398 | 0 | missing | |
2 | 11.0482 | 7.67744 | ec6fffc4-1ac5-4432-8a71-51896def33ae | 2 | 0 | 16 | 2 | 0.30025 | 0.69975 | 10.0361 | 0 | missing | |
3 | 11.0482 | 7.67744 | ec6fffc4-1ac5-4432-8a71-51896def33ae | 2 | 0 | 16 | 3 | 0.536937 | 0.463063 | 9.23829 | 0 | missing | |
4 | 11.0482 | 7.67744 | ec6fffc4-1ac5-4432-8a71-51896def33ae | 2 | 0 | 16 | 4 | 0.649463 | 0.350537 | 8.859 | 0 | missing |
Define Machine Learning Model
# TODO: remove line, just for testing
# NOTE(review): `training_data` ALIASES `new_data` here (no copy), so the
# `append!(training_data, new_data)` inside `fit_model` appends the frame to
# itself, duplicating every row. Harmless for a smoke test; do not reuse.
training_data = new_data
"""
    convert_dataframe_to_recommender(df, n_users, n_books)

Convert each row of the user-book DataFrame into a `Recommendation.Event`
(user id, book id, rating = true utility) and wrap the events in a
`DataAccessor` sized `n_users` x `n_books`.
"""
function convert_dataframe_to_recommender(df::DataFrame, n_users, n_books)
# Here we assume that a user knows and reports their utility after having read the book.
# A typed comprehension replaces the old untyped `[]` push-loop plus
# `convert` round-trip: it builds a `Vector{Event}` directly, avoiding the
# intermediate `Vector{Any}`.
event_list = Event[Event(row[:user_id], row[:book_id], row[:user_book_utility]) for row in eachrow(df)]
return DataAccessor(event_list, n_users, n_books)
end
"""
    fit_model(epoch_parameters, training_data, new_data)

Fold the epoch's new user-book rows into `training_data` (mutating it), then
fit a rank-10 SVD collaborative filter on the rows whose rating is visible to
the bookstore (`observed` purchases or `pre_read` review copies).
"""
function fit_model(epoch_parameters::DataFrameRow, training_data::DataFrame, new_data::DataFrame)
# Note, the statement below permanently adds the new data to the training dataset
append!(training_data, new_data, promote=true)
# Rating-matrix dimensions for the recommender; assumes user/book ids are
# dense in 1:max — TODO confirm.
n_users = maximum(training_data[!, :user_id])
n_books = maximum(training_data[!, :book_id])
# Drop unobserved outcomes (rebinds the local only; the caller's frame keeps
# all rows, including the unobserved ones).
training_data = @chain training_data @subset(:observed | :pre_read)
data = convert_dataframe_to_recommender(training_data, n_users, n_books)
# Rank-10 factorization of the user x book rating matrix.
recommender = SVD(data, 10)
build!(recommender)
return recommender
end
# Smoke-test: fit the recommender once on the debug data (epoch-1 parameters).
fit_model(eachrow(book_sim_description)[1], training_data, new_data)
Recommendation.SVD(Recommendation.DataAccessor(Recommendation.Event[Recommendation.Event(12, 16, 9.515944852840331), Recommendation.Event(16, 16, 9.493420422158717), Recommendation.Event(23, 16, 9.602995885931103), Recommendation.Event(32, 16, 9.631113534034048), Recommendation.Event(46, 16, 9.168662479879151), Recommendation.Event(47, 16, 9.867610889063794), Recommendation.Event(50, 16, 9.575901908098166), Recommendation.Event(51, 16, 9.60576813827057), Recommendation.Event(77, 16, 9.267029916750198), Recommendation.Event(3, 17, 11.566230771558324) … Recommendation.Event(41, 30, 7.289597670202056), Recommendation.Event(46, 30, 8.784368443736202), Recommendation.Event(63, 30, 8.54369411183778), Recommendation.Event(67, 30, 8.30240913341008), Recommendation.Event(73, 30, 8.64870749525196), Recommendation.Event(74, 30, 9.134883286370302), Recommendation.Event(81, 30, 7.544380344932471), Recommendation.Event(88, 30, 8.555875586291524), Recommendation.Event(94, 30, 8.39110494863881), Recommendation.Event(96, 30, 8.26293250196835)], [NaN NaN … 17.475438301898762 NaN; NaN NaN … NaN NaN; … ; NaN NaN … NaN NaN; NaN NaN … NaN NaN], Dict{Int64, Any}(), Dict{Int64, Any}()), 10, Union{Nothing, Float64}[0.08355367605437564 -0.026134613266781376 … -0.03420201858791656 -0.05936916372171807; 0.014287208163610676 0.013254255723090372 … -0.0024508458642997152 -0.18533734818760617; … ; 0.096017276367716 0.007876358892420093 … 0.06204078887726933 0.037685255920138896; 0.07799797047469953 -0.02140718919697145 … -0.017981755996201484 0.041090133765272055], Union{Nothing, Float64}[111.62446777282649, 98.71216232697275, 86.0488878712404, 80.33508370095045, 68.45514600033641, 62.27783486412369, 59.389618326006286, 55.48817518904382, 49.818842060548356, 47.05621195159057], Union{Nothing, Float64}[5.598293801596863e-18 5.551115123125783e-17 … 0.0995632302193775 0.24200448889416784; -1.9801356300431266e-18 -1.3877787807814457e-17 … 0.006917464946033532 0.10589729480741074; … ; 
-5.134612071840844e-18 2.0816681711721685e-17 … 0.1882824258847828 -0.06866977834290362; -9.436120576798104e-18 -2.7755575615628914e-17 … 0.04873726867452397 -0.06033393166076437])
# Model-summary hook for the simulation loop. Intentionally minimal here —
# it records only which epoch was fit — but in a real study it could track
# parameter and model outcomes per epoch.
function summarize_model(epoch_parameters::DataFrameRow, model, simulation_data::DataFrame, new_data::DataFrame)
epoch_column = [epoch_parameters.epoch]
return DataFrame(:epoch => epoch_column)
end
summarize_model (generic function with 1 method)
"""
    choose_observations(epoch_parameters, recommender, new_data, simulation_data)

For every user with at least one unread (unobserved) book, ask the recommender
for its top pick among that user's unread books, then mark that book as
observed and record the model's predicted utility. Mutates and returns
`simulation_data`.
NOTE(review): the premise mentions a utility-greater-than-0 purchase
threshold, but no threshold is applied here — the top pick is always bought.
Confirm whether that is intended.
"""
function choose_observations(epoch_parameters::DataFrameRow, recommender, new_data::DataFrame, simulation_data::DataFrame)
# NOTE: as the new_data is already added to the simulation data during the model fit, no need to use `new_data` here
# Each user gets to read an additional book!
for user_id in unique((@chain simulation_data @subset(!:observed) _[!, :user_id]))
# Rank this user's unread candidate books and take the single best one.
user_prediction = recommend(recommender, user_id, 1, (@chain simulation_data @subset(!:observed & (:user_id == user_id)) @select(:book_id) unique _[!, :book_id]))
best_book = user_prediction[1][1]
best_book_score = user_prediction[1][2]
# Mark the chosen (user, book) row as read and store the predicted score.
simulation_data[((simulation_data[!, :user_id] .== user_id) .& (simulation_data[!, :book_id] .== best_book)), :observed] .= true
simulation_data[((simulation_data[!, :user_id] .== user_id) .& (simulation_data[!, :book_id] .== best_book)), :predicted_utility] .= best_book_score
end
return simulation_data
end
choose_observations (generic function with 1 method)
Put it all together and run the simulation
# Assemble the full simulation definition from the DGP, epoch descriptions,
# and the four hook functions defined above, then run every epoch.
ips = IteratedProcessSimulation(book_dgp, book_sim_description, transform_data, fit_model, summarize_model, choose_observations)
simulation_data, model_summary, model_objects = run_simulation(ips)
# TODO: for debugging, remove
user_id = 1
# Inspect the model fitted in the final epoch. Use `n_months` instead of the
# previously hard-coded 36 so this stays consistent with the parameters above.
recommender = model_objects[n_months]
Recommendation.SVD(Recommendation.DataAccessor(Recommendation.Event[Recommendation.Event(24, 1, 11.07392731435876), Recommendation.Event(25, 1, 11.074849799492153), Recommendation.Event(57, 1, 11.109913085446344), Recommendation.Event(59, 1, 11.142106611631375), Recommendation.Event(79, 1, 11.119510585570426), Recommendation.Event(23, 2, 5.768074613645223), Recommendation.Event(74, 2, 6.1963439009825), Recommendation.Event(76, 2, 5.924029832580604), Recommendation.Event(94, 2, 5.896485270942696), Recommendation.Event(2, 3, 11.200770141573434) … Recommendation.Event(90, 539, 9.165672388410753), Recommendation.Event(8, 540, 9.953628903306619), Recommendation.Event(28, 540, 10.7306881051593), Recommendation.Event(38, 540, 10.816388977593874), Recommendation.Event(40, 540, 10.168435562792494), Recommendation.Event(45, 540, 11.773586617133558), Recommendation.Event(51, 540, 10.312538762054634), Recommendation.Event(69, 540, 9.760542212456357), Recommendation.Event(91, 540, 11.033963853145643), Recommendation.Event(93, 540, 11.647403808749218)], [NaN NaN … NaN NaN; NaN NaN … NaN NaN; … ; NaN NaN … NaN NaN; NaN NaN … NaN NaN], Dict{Int64, Any}(), Dict{Int64, Any}()), 10, Union{Nothing, Float64}[-0.10012638500267834 0.05003880953603024 … 0.0546905062029917 -0.1236259502818227; -0.08901207948256673 -0.08820804020876574 … 0.046116632169862865 0.04632167212184227; … ; -0.0907958147261419 -0.1396938984031464 … -0.07225485477743088 -0.059644101304259595; -0.10269904625520226 -0.01410229324663866 … 0.03393296014555445 -0.10480942068302372], Union{Nothing, Float64}[479.6308410621238, 222.33296526141234, 201.96185560951434, 188.9747196744335, 180.13862696299518, 176.72263058535552, 168.63397879087532, 161.5344635777463, 156.54855950652282, 141.5109887209243], Union{Nothing, Float64}[-0.0101297099242509 -0.004864486434046431 … -0.025525359715715183 -0.020444117239746224; -0.023960936692049087 -0.0046500001057154005 … 0.008263252736047624 0.014640475326243085; … ; 
0.026143996759117095 -0.005506422001605704 … 0.05311404896282505 0.007459616471914163; 0.01348334352556254 -0.005308522160959778 … 0.0295702761593563 -0.010224011208820728])
Assess outcome quality
# Per-user rollup: total utility actually achieved (books marked observed)
# versus the best attainable utility (the user's top `n_months` books by true
# utility, since a user can read at most one book per month).
utility_rollup = @chain simulation_data begin
@groupby(:user_id, :user_utility_weight_quality, :user_utility_weight_topicality)
@combine(:user_utility_achieved = sum(:user_book_utility[:observed]),
:user_utility_predicted = sum(:predicted_utility[:observed]), # this should be strictly positive
:n_books_purchased = length(:predicted_utility[:observed]),
:user_utility_possible = @c sum(sort(:user_book_utility, rev=true)[1:n_months]) # user has the possibility of choosing X books = n_months
)
@transform(:pct_utility_achieved = :user_utility_achieved / :user_utility_possible)
end
first(utility_rollup, 6)
6 rows × 8 columns
user_id | user_utility_weight_quality | user_utility_weight_topicality | user_utility_achieved | user_utility_predicted | n_books_purchased | user_utility_possible | pct_utility_achieved | |
---|---|---|---|---|---|---|---|---|
Int64 | Float64 | Float64 | Float64 | Float64 | Int64 | Float64 | Float64 | |
1 | 1 | 0.514482 | 0.485518 | 425.229 | 203.888 | 36 | 504.897 | 0.84221 |
2 | 2 | 0.30025 | 0.69975 | 433.973 | 234.014 | 36 | 517.526 | 0.838553 |
3 | 3 | 0.536937 | 0.463063 | 424.728 | 175.278 | 36 | 505.043 | 0.840973 |
4 | 4 | 0.649463 | 0.350537 | 420.248 | 170.707 | 36 | 509.816 | 0.824314 |
5 | 5 | 0.503436 | 0.496564 | 422.792 | 223.803 | 36 | 504.867 | 0.837432 |
6 | 6 | 0.536799 | 0.463201 | 429.37 | 233.211 | 36 | 505.042 | 0.850166 |
Plot Utility Distribution across Users
# Histogram of total achieved utility per user (bins of width 0.5).
utility_rollup |> @vlplot(:bar, width=500, height=300, x={:user_utility_achieved, bin={step=0.5}, title="Total Utility Achieved"}, y={"count()", title="User Count"}, title="Utility Achieved per User")
Plot Predicted Utility vs Actual Utility
# Scatter: achieved utility (x) vs. best attainable utility (y) per user.
utility_rollup |> @vlplot(:point, width=500, height=500, x={:user_utility_achieved, title="Total Utility Achieved"}, y={:user_utility_possible, title="Total Utility Possible"}, title="Model Relatively Ineffective", resolve={scale={x=:independent, y=:independent}})
Plot Percent Utility Achieved across Users
# Histogram of the fraction of attainable utility each user realized.
utility_rollup |> @vlplot(width=500, height=300, :bar, x={:pct_utility_achieved, bin={step=0.005}, title="Percent Utility Achieved", axis={format="%"}}, y={"count()", title="User Count"}, title="Percentage of Possible Utility Achieved per User")
Plot User Preferences
# Distribution of the latent quality-vs-topicality preference weight.
utility_rollup |> @vlplot(width=500, height=300, :bar, x={:user_utility_weight_quality, bin={step=0.05}, title="User Preference for Quality (over Topicality)", axis={format="%"}}, y={"count()", title="User Count"}, title="User Preferences, Percentage Weight Quality (vs Topicality)")
Plot Individual Preferences Against Total Utility Possible
# Mean attainable utility, bucketed by preference weight.
utility_rollup |> @vlplot(width=500, height=300, :bar, x={:user_utility_weight_quality, bin={step=0.01}, title="User Preference for Quality (over Topicality)", axis={format="%"}}, y={"mean(user_utility_possible)", title="Possible Utility"}, title="Possible Utility by User Preferences")
Plot Individual Preferences Against Utility
# Mean realized share of attainable utility, bucketed by preference weight.
utility_rollup |> @vlplot(width=500, height=300, :bar, x={:user_utility_weight_quality, bin={step=0.01}, title="User Preference for Quality (over Topicality)", axis={format="%"}}, y={"mean(pct_utility_achieved)", title="Average Percent Utility Achieved", axis={format="%"}}, title="Percentage of Possible Utility Achieved by User Preferences")
# Mean absolute utility achieved, bucketed by preference weight.
utility_rollup |> @vlplot(width=500, height=300, :bar, x={:user_utility_weight_quality, bin={step=0.01}, title="User Preference for Quality (over Topicality)", axis={format="%"}}, y={"mean(user_utility_achieved)", title="Average Utility Achieved"}, title="Average Utility Achieved by User Preferences")