Using the following data sets, write code to create datafram…
# Exercise: Using the following data sets, write code to create dataframes,
# join them, and then using a HOF and Lambda function, filter to customers
# who had more than 2 items in an order.

from pyspark.sql import Row

# Sale-level summary: one Row per sale, keyed by sale_id. The same cust_id
# can appear on several sales (e.g. 1046, 1023, 1010).
# Dates are kept as the original M/D/YY strings.
sales_sum_data = [
    Row(
        sales_date="6/17/23", sale_id=80940, cust_id=1042,
        cust_first_name="Ali", cust_last_name="Walter",
        sales_amt=450, holiday_promo_flag="N", promo_percent=0,
        tax_percent=6.25, total_amt=478.13,
    ),
    Row(
        sales_date="1/31/24", sale_id=80685, cust_id=1046,
        cust_first_name="Beatriz", cust_last_name="Chambers",
        sales_amt=125, holiday_promo_flag="N", promo_percent=0,
        tax_percent=7.10, total_amt=133.88,
    ),
    Row(
        sales_date="5/29/23", sale_id=80618, cust_id=1023,
        cust_first_name="Charles", cust_last_name="Bell",
        sales_amt=310, holiday_promo_flag="Y", promo_percent=15,
        tax_percent=5.50, total_amt=277.99,
    ),
    Row(
        sales_date="5/30/23", sale_id=80430, cust_id=1010,
        cust_first_name="Diya", cust_last_name="Koerner",
        sales_amt=560, holiday_promo_flag="Y", promo_percent=15,
        tax_percent=6.00, total_amt=504.56,
    ),
    Row(
        sales_date="12/6/23", sale_id=80013, cust_id=1088,
        cust_first_name="Eric", cust_last_name="Jenkins",
        sales_amt=455, holiday_promo_flag="N", promo_percent=0,
        tax_percent=5.75, total_amt=481.16,
    ),
    Row(
        sales_date="11/24/23", sale_id=80885, cust_id=1046,
        cust_first_name="Beatriz", cust_last_name="Chambers",
        sales_amt=230, holiday_promo_flag="Y", promo_percent=20,
        tax_percent=7.10, total_amt=197.06,
    ),
    Row(
        sales_date="11/24/23", sale_id=80304, cust_id=1099,
        cust_first_name="Fatima", cust_last_name="Lee",
        sales_amt=670, holiday_promo_flag="Y", promo_percent=20,
        tax_percent=8.00, total_amt=578.88,
    ),
    Row(
        sales_date="5/26/24", sale_id=80281, cust_id=1072,
        cust_first_name="Gabriel", cust_last_name="Fraizer",
        sales_amt=500, holiday_promo_flag="Y", promo_percent=15,
        tax_percent=5.50, total_amt=448.38,
    ),
    Row(
        sales_date="5/27/24", sale_id=80396, cust_id=1023,
        cust_first_name="Charles", cust_last_name="Bell",
        sales_amt=310, holiday_promo_flag="Y", promo_percent=15,
        tax_percent=5.50, total_amt=277.99,
    ),
    Row(
        sales_date="2/12/24", sale_id=80807, cust_id=1010,
        cust_first_name="Diya", cust_last_name="Koerner",
        sales_amt=265, holiday_promo_flag="N", promo_percent=0,
        tax_percent=6.00, total_amt=280.90,
    ),
]

# Column names for the tuples in sales_detail_data, in positional order.
sales_detail_columns = [
    "sale_item_id",
    "sale_id",
    "item_line_num",
    "item_id",
    "item_price",
    "shipping_package_id",
    "shipping_date",
]

# Line-item detail: one tuple per item on a sale. The second field (sale_id)
# matches sale_id in sales_sum_data, so it is the natural join key.
# In this data sale_item_id is sale_id with item_line_num appended.
sales_detail_data = [
    (809401, 80940, 1, 457, 225, 3797, "6/18/23"),
    (809402, 80940, 2, 457, 225, 3797, "6/18/23"),
    (806851, 80685, 1, 547, 125, 4484, "2/4/24"),
    (806181, 80618, 1, 432, 110, 3694, "5/31/23"),
    (806182, 80618, 2, 478, 200, 4711, "5/31/23"),
    (804301, 80430, 1, 585, 560, 3216, "5/31/23"),
    (800131, 80013, 1, 463, 155, 4012, "12/9/23"),
    (800132, 80013, 2, 564, 200, 3812, "12/8/23"),
    (800133, 80013, 3, 461, 100, 3812, "12/8/23"),
    (808851, 80885, 1, 595, 230, 4083, "11/27/23"),
    (803041, 80304, 1, 457, 225, 4486, "11/27/23"),
    (803042, 80304, 2, 588, 320, 4486, "11/27/23"),
    (803043, 80304, 3, 547, 125, 4486, "11/27/23"),
    (802811, 80281, 1, 470, 500, 4591, "5/27/24"),
    (803961, 80396, 1, 432, 110, 4315, "5/28/24"),
    (803962, 80396, 2, 478, 200, 4315, "5/28/24"),
    (808071, 80807, 1, 422, 265, 4203, "2/16/24"),
]
Read Details