using CSV using DataFrames using StatsBase PRICE_LB = 1.99 PRICE_UB = 5.00 transaction = CSV.read("transaction_data.csv", DataFrame) product = CSV.read("product.csv", DataFrame) demo = CSV.read("hh_demographic.csv", DataFrame) # Find which PRODUCT_IDs are for strawberries strawberry = filter(r -> occursin("STRAWBERR", r.SUB_COMMODITY_DESC) && r.CURR_SIZE_OF_PRODUCT ∈ ("16 OZ", "1 LB"), product) # Filter to strawberry PRODUCT_IDs transaction[!, :UNIT_PRICE] = transaction.SALES_VALUE ./ transaction.QUANTITY strawberry_transactions = filter(r -> r.PRODUCT_ID ∈ strawberry.PRODUCT_ID && PRICE_LB <= r.UNIT_PRICE <= PRICE_UB, transaction) strawberry_transactions = leftjoin(strawberry_transactions, strawberry, on=:PRODUCT_ID) # Find all transactions of households that at least purchased strawberries once households = unique(strawberry_transactions.household_key) all_transactions = filter(r -> r.household_key ∈ households, transaction) gdf = groupby(all_transactions, [:household_key, :BASKET_ID, :DAY]) per_trip = combine(gdf) do df strawberry_purchases = filter(r -> r.PRODUCT_ID ∈ strawberry.PRODUCT_ID && PRICE_LB <= r.UNIT_PRICE <= PRICE_UB, df) if isempty(strawberry_purchases) (price=NaN, outcome=0,) else (price=strawberry_purchases[1, :UNIT_PRICE], outcome=1,) end end per_trip = innerjoin(per_trip, demo, on=:household_key) # Impute prices when no strawberries were purchased on a trip # Use the daily mode of strawberry prices on the most recent day strawberry_per_day = combine(groupby(strawberry_transactions, :DAY)) do df (UNIT_PRICE=mode(df.UNIT_PRICE),) end strawberry_all_days = DataFrame(VIRTUAL_DAY=1:maximum(per_trip.DAY)) full = crossjoin(strawberry_all_days, strawberry_per_day) # For each virtual day, subset to days on or before to find most recent day full_before = full[full.VIRTUAL_DAY .>= full.DAY, :] sort!(full_before, [:VIRTUAL_DAY, :DAY], rev=[false, true]) prices = combine(groupby(full_before, :VIRTUAL_DAY)) do df (UNIT_PRICE_MEAN = df.UNIT_PRICE[1],) end prices = prices[completecases(prices, :UNIT_PRICE_MEAN), :] # Join on the VIRTUAL_DAY to impute allowmissing!(per_trip) per_trip = leftjoin(per_trip, prices, on=(:DAY => :VIRTUAL_DAY)) per_trip[isnan.(per_trip.price), :price] = per_trip[isnan.(per_trip.price), :UNIT_PRICE_MEAN] per_trip = per_trip[completecases(per_trip), :] CSV.write("grocery_pricing.csv", per_trip)