diff --git a/.data/c360-fashion-retail/generate_complete_dataset.py b/.data/c360-fashion-retail/generate_complete_dataset.py index a74cdc66..693c51b6 100644 --- a/.data/c360-fashion-retail/generate_complete_dataset.py +++ b/.data/c360-fashion-retail/generate_complete_dataset.py @@ -179,12 +179,30 @@ def timestamp_between(self, start_date, end_date): return fake.date_time_between(start_date, end_date) def save_to_csv(self, table_name, data, headers): +<<<<<<< HEAD + """Save data to CSV file with PostgreSQL-compatible formatting""" +======= """Save data to CSV file""" +>>>>>>> origin/main filepath = os.path.join(self.output_dir, f'{table_name}.csv') with open(filepath, 'w', newline='', encoding='utf-8') as csvfile: writer = csv.writer(csvfile) writer.writerow(headers) +<<<<<<< HEAD + + # Convert Python booleans to PostgreSQL-compatible format + for row in data: + formatted_row = [] + for value in row: + if isinstance(value, bool): + formatted_row.append('true' if value else 'false') + else: + formatted_row.append(value) + writer.writerow(formatted_row) + +======= writer.writerows(data) +>>>>>>> origin/main print(f"āœ… Generated {len(data)} rows for {table_name}") self.data[table_name] = data @@ -205,6 +223,23 @@ def generate_brands(self): suffix = random.choice(['Co', 'Brand', 'Label', 'House', 'Studio', 'Collection', 'Design', 'Atelier', 'Boutique', 'Group']) brand_name = f"{base_name} {suffix} {i+1-len(FashionProvider.fashion_brands)}" if i >= len(FashionProvider.fashion_brands) else FashionProvider.fashion_brands[i] +<<<<<<< HEAD + # Create brand data in database schema column order + brand_data = [ + i + 1, # brand_id + brand_name, # brand_name + fake.company() if random.random() < 0.3 else None, # parent_company + self.weighted_choice(tiers, weights), # brand_tier + fake.text(max_nb_chars=200), # description + fake.url(), # website_url + random.randint(1950, 2022), # year_established + random.choice([True, False]) if random.random() < 0.1 else True, # is_active + self.timestamp_between('-5y', '-1y') # created_at + ] + brands.append(brand_data) + + headers = ['brand_id', 'brand_name', 'parent_company', 'brand_tier', 'description', 'website_url', 'year_established', 'is_active', 'created_at'] +======= brand = { 'brand_id': i + 1, 'brand_name': brand_name, @@ -219,6 +254,7 @@ def generate_brands(self): brands.append(list(brand.values())) headers = ['brand_id', 'brand_name', 'brand_tier', 'created_at', 'description', 'is_active', 'parent_company', 'website_url', 'year_established'] +>>>>>>> origin/main self.save_to_csv('brands', brands, headers) return brands @@ -230,6 +266,19 @@ def generate_categories(self): # Use all main categories and create more subcategories for main_cat, subcats in FashionProvider.fashion_categories.items(): # Add main category +<<<<<<< HEAD + # Create main category data in database schema column order + main_category_data = [ + cat_id, # category_id + main_cat, # category_name + None, # parent_category_id + fake.text(max_nb_chars=150), # description + random.choice([True, False]), # is_seasonal + round(random.uniform(0.15, 0.45), 2), # typical_margin_percentage + self.timestamp_between('-3y', '-1y') # created_at + ] + categories.append(main_category_data) +======= main_category = { 'category_id': cat_id, 'category_name': main_cat, @@ -240,12 +289,29 @@ def generate_categories(self): 'typical_margin_percentage': round(random.uniform(0.15, 0.45), 2) } categories.append(list(main_category.values())) +>>>>>>> origin/main main_cat_id = cat_id cat_id += 1 # Add more 
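# Note on the save_to_csv conflict above: csv.writer stringifies Python
# booleans as "True"/"False"; the HEAD branch lowercases them so the
# generated CSVs use the same true/false literals PostgreSQL itself emits
# for boolean columns. A standalone sketch of that conversion (helper names
# here are illustrative, not part of the patch):
def _pg_bool(value):
    # Lowercase boolean literals; pass every other value through unchanged.
    return ('true' if value else 'false') if isinstance(value, bool) else value

def _pg_rows(rows):
    return [[_pg_bool(value) for value in row] for row in rows]
# e.g. writer.writerows(_pg_rows(data)) is equivalent to the HEAD loop.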
subcategories for larger dataset num_subcats = min(len(subcats), random.randint(2, 4)) for subcat in random.sample(subcats, num_subcats): +<<<<<<< HEAD + # Create subcategory data in database schema column order + sub_category_data = [ + cat_id, # category_id + subcat, # category_name + main_cat_id, # parent_category_id + fake.text(max_nb_chars=100), # description + random.choice([True, False]), # is_seasonal + round(random.uniform(0.15, 0.45), 2), # typical_margin_percentage + self.timestamp_between('-3y', '-1y') # created_at + ] + categories.append(sub_category_data) + cat_id += 1 + + headers = ['category_id', 'category_name', 'parent_category_id', 'description', 'is_seasonal', 'typical_margin_percentage', 'created_at'] +======= sub_category = { 'category_id': cat_id, 'category_name': subcat, @@ -259,6 +325,7 @@ def generate_categories(self): cat_id += 1 headers = ['category_id', 'category_name', 'created_at', 'description', 'is_seasonal', 'parent_category_id', 'typical_margin_percentage'] +>>>>>>> origin/main self.save_to_csv('categories', categories, headers) return categories @@ -273,6 +340,31 @@ def generate_products(self): brand = random.choice(brands) category = random.choice(categories) +<<<<<<< HEAD + # Create product data in database schema column order + product_data = [ + f'P{i+1:07d}', # product_id + f"{fake.catch_phrase()} {category}", # product_name + brand, # brand + category, # category_l1 + random.choice(categories) if random.random() < 0.7 else None, # category_l2 + random.choice(categories) if random.random() < 0.3 else None, # category_l3 + self.weighted_choice(['Women', 'Men', 'Unisex'], [0.45, 0.35, 0.20]), # gender_target + random.choice(['Spring', 'Summer', 'Fall', 'Winter', 'All Season']), # season + random.choice(FashionProvider.fashion_materials), # material + random.choice(['Machine wash cold', 'Dry clean only', 'Hand wash', 'Machine wash warm']), # care_instructions + random.randint(0, 100), # sustainability_score + self.weighted_choice(['$30-50', '$50-100', '$100-150', '$150-200'], [0.3, 0.4, 0.2, 0.1]), # price_range + self.date_between('-2y', 'now'), # launch_date + True if random.random() < 0.9 else False, # is_active + self.timestamp_between('-2y', 'now') # created_at + ] + products.append(product_data) + + headers = ['product_id', 'product_name', 'brand', 'category_l1', 'category_l2', 'category_l3', + 'gender_target', 'season', 'material', 'care_instructions', 'sustainability_score', 'price_range', + 'launch_date', 'is_active', 'created_at'] +======= product = { 'product_id': f'P{i+1:07d}', 'product_name': f"{fake.catch_phrase()} {category}", @@ -295,6 +387,7 @@ def generate_products(self): headers = ['product_id', 'product_name', 'brand', 'category_l1', 'category_l2', 'category_l3', 'created_at', 'gender_target', 'is_active', 'launch_date', 'material', 'price_range', 'season', 'sustainability_score', 'care_instructions'] +>>>>>>> origin/main self.save_to_csv('products', products, headers) return products @@ -309,6 +402,26 @@ def generate_product_variants(self): num_variants = random.randint(3, 6) for _ in range(num_variants): +<<<<<<< HEAD + # Create variant data in database schema column order to match: + # variant_id, product_id, sku, size, color, additional_price, weight_oz, stock_quantity, created_at + variant_data = [ + variant_id, # variant_id + product_id, # product_id + f"SKU{product_id[1:]}V{variant_id:04d}", # sku + random.choice(FashionProvider.sizes), # size + random.choice(FashionProvider.colors), # color + round(random.uniform(0, 20), 2), 
# additional_price + round(random.uniform(2, 15), 2), # weight_oz + random.randint(10, 500), # stock_quantity + self.timestamp_between('-2y', 'now') # created_at + ] + variants.append(variant_data) + variant_id += 1 + + headers = ['variant_id', 'product_id', 'sku', 'size', 'color', 'additional_price', + 'weight_oz', 'stock_quantity', 'created_at'] +======= variant = { 'variant_id': variant_id, 'product_id': product_id, @@ -325,6 +438,7 @@ def generate_product_variants(self): headers = ['variant_id', 'product_id', 'sku', 'size', 'color', 'additional_price', 'stock_quantity', 'weight_oz', 'created_at'] +>>>>>>> origin/main self.save_to_csv('product_variants', variants, headers) return variants @@ -357,6 +471,34 @@ def generate_customers(self): dob = fake.date_between(date(birth_year, 1, 1), date(birth_year, 12, 31)) reg_date = self.timestamp_between('-3y', 'now') +<<<<<<< HEAD + # Create customer data in database schema column order to match: + # customer_id, email, phone, first_name, last_name, date_of_birth, gender, + # acquisition_channel, registration_date, account_status, preferred_language, + # marketing_consent, last_activity_date, created_at, updated_at + customer_data = [ + i + 1, # customer_id + fake.email(), # email + fake.phone_number()[:20], # phone + fake.first_name_male() if gender == 'male' else fake.first_name_female(), # first_name + fake.last_name(), # last_name + dob, # date_of_birth + gender, # gender + self.weighted_choice(list(channels.keys()), list(channels.values())), # acquisition_channel + reg_date, # registration_date + 'active' if random.random() < 0.90 else 'inactive', # account_status + 'en', # preferred_language (use 'en' instead of 'English' for consistency) + random.random() < 0.70, # marketing_consent + self.timestamp_between(reg_date, 'now'), # last_activity_date + reg_date, # created_at + self.timestamp_between(reg_date, 'now') # updated_at + ] + customers.append(customer_data) + + headers = ['customer_id', 'email', 'phone', 'first_name', 'last_name', 'date_of_birth', + 'gender', 'acquisition_channel', 'registration_date', 'account_status', + 'preferred_language', 'marketing_consent', 'last_activity_date', 'created_at', 'updated_at'] +======= customer = { 'customer_id': i + 1, 'first_name': fake.first_name_male() if gender == 'male' else fake.first_name_female(), @@ -379,6 +521,7 @@ def generate_customers(self): headers = ['customer_id', 'first_name', 'last_name', 'email', 'phone', 'date_of_birth', 'gender', 'registration_date', 'acquisition_channel', 'marketing_consent', 'account_status', 'preferred_language', 'created_at', 'updated_at', 'last_activity_date'] +>>>>>>> origin/main self.save_to_csv('customers', customers, headers) return customers @@ -394,6 +537,27 @@ def generate_customer_addresses(self): num_addresses = 1 if random.random() < 0.5 else 2 for j in range(num_addresses): +<<<<<<< HEAD + # Create address data in database schema column order to match: + # address_id, customer_id, address_type, street_address, city, state_province, + # postal_code, country, is_primary, created_at + address_data = [ + self.generate_id('ADDR'), # address_id + customer_id, # customer_id + 'primary' if j == 0 else random.choice(['billing', 'shipping', 'work']), # address_type + fake.street_address(), # street_address + fake.city(), # city + random.choice(states), # state_province + fake.postcode(), # postal_code + 'US', # country + j == 0, # is_primary + self.timestamp_between('-3y', 'now') # created_at + ] + addresses.append(address_data) + + headers = 
['address_id', 'customer_id', 'address_type', 'street_address', 'city', + 'state_province', 'postal_code', 'country', 'is_primary', 'created_at'] +======= address = { 'address_id': self.generate_id('ADDR'), 'customer_id': customer_id, @@ -410,6 +574,7 @@ def generate_customer_addresses(self): headers = ['address_id', 'customer_id', 'address_type', 'is_primary', 'street_address', 'city', 'state_province', 'postal_code', 'country', 'created_at'] +>>>>>>> origin/main self.save_to_csv('customer_addresses', addresses, headers) return addresses @@ -429,6 +594,42 @@ def generate_orders(self): order_value = max(20, min(200, np.random.normal(60, 15))) order_date = self.timestamp_between('-2y', 'now') +<<<<<<< HEAD + # Create order data in database schema column order to match: + # order_id, customer_id, order_date, order_status, channel, store_location, + # subtotal_amount, tax_amount, shipping_amount, discount_amount, total_amount, + # currency, payment_method, shipping_method, promo_code, is_first_order, + # device_type, utm_source, utm_campaign, created_at, updated_at + order_data = [ + i + 1, # order_id + customer_id, # customer_id + order_date, # order_date + self.weighted_choice(list(status_dist.keys()), list(status_dist.values())), # order_status + random.choice(['web', 'mobile', 'store', 'phone']), # channel + fake.city() if random.random() < 0.1 else None, # store_location + round(order_value * 0.9, 2), # subtotal_amount + round(order_value * 0.08, 2), # tax_amount + round(random.uniform(0, 15), 2), # shipping_amount + round(random.uniform(0, order_value * 0.2), 2) if random.random() < 0.3 else 0, # discount_amount + round(order_value, 2), # total_amount + 'USD', # currency + self.weighted_choice(list(payment_methods.keys()), list(payment_methods.values())), # payment_method + random.choice(['standard', 'express', 'overnight', 'pickup']), # shipping_method + fake.lexify('PROMO????') if random.random() < 0.2 else None, # promo_code + random.random() < 0.2, # is_first_order + random.choice(['desktop', 'mobile', 'tablet']), # device_type + random.choice(['google', 'facebook', 'instagram', 'email', 'direct']) if random.random() < 0.6 else None, # utm_source + fake.lexify('campaign_???') if random.random() < 0.4 else None, # utm_campaign + order_date, # created_at + self.timestamp_between(order_date, 'now') # updated_at + ] + orders.append(order_data) + + headers = ['order_id', 'customer_id', 'order_date', 'order_status', 'channel', 'store_location', + 'subtotal_amount', 'tax_amount', 'shipping_amount', 'discount_amount', 'total_amount', + 'currency', 'payment_method', 'shipping_method', 'promo_code', 'is_first_order', + 'device_type', 'utm_source', 'utm_campaign', 'created_at', 'updated_at'] +======= order = { 'order_id': i + 1, 'customer_id': customer_id, @@ -459,6 +660,7 @@ def generate_orders(self): 'payment_method', 'currency', 'channel', 'device_type', 'shipping_method', 'is_first_order', 'promo_code', 'store_location', 'utm_source', 'utm_campaign', 'created_at', 'updated_at'] +>>>>>>> origin/main self.save_to_csv('orders', orders, headers) return orders @@ -473,7 +675,11 @@ def generate_order_items(self): for order in orders: order_id = order[0] +<<<<<<< HEAD + order_total = order[10] # total_amount is now at position 10 after column reordering +======= order_total = order[4] +>>>>>>> origin/main num_items = random.randint(1, 4) @@ -488,6 +694,31 @@ def generate_order_items(self): unit_price = round(order_total / num_items / quantity, 2) total_price = round(unit_price * quantity, 2) 
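# Both branches of this file read order rows positionally, which is why the
# column reorder forces edits like order[4] -> order[10] for total_amount
# (as in the generate_order_items hunk above and generate_returns below).
# A hypothetical helper, not in the patch, that derives indices from the same
# headers list passed to save_to_csv would keep these lookups stable across
# future reorders:
ORDER_HEADERS = ['order_id', 'customer_id', 'order_date', 'order_status', 'channel',
                 'store_location', 'subtotal_amount', 'tax_amount', 'shipping_amount',
                 'discount_amount', 'total_amount', 'currency', 'payment_method',
                 'shipping_method', 'promo_code', 'is_first_order', 'device_type',
                 'utm_source', 'utm_campaign', 'created_at', 'updated_at']
ORDER_COL = {name: idx for idx, name in enumerate(ORDER_HEADERS)}
# order_total = order[ORDER_COL['total_amount']]  # instead of hard-coded order[10]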
+<<<<<<< HEAD + # Create order item data in database schema column order to match: + # order_item_id, order_id, product_id, sku, quantity, unit_price, total_price, + # discount_applied, final_price, size, color, personalization, gift_wrap + item_data = [ + item_id, # order_item_id + order_id, # order_id + product_id, # product_id + variant[2], # sku + quantity, # quantity + unit_price, # unit_price + total_price, # total_price + round(random.uniform(0, total_price * 0.1), 2) if random.random() < 0.2 else 0, # discount_applied + total_price, # final_price + variant[3], # size + variant[4], # color + fake.sentence(nb_words=3) if random.random() < 0.02 else None, # personalization + random.random() < 0.05 # gift_wrap + ] + order_items.append(item_data) + item_id += 1 + + headers = ['order_item_id', 'order_id', 'product_id', 'sku', 'quantity', 'unit_price', 'total_price', + 'discount_applied', 'final_price', 'size', 'color', 'personalization', 'gift_wrap'] +======= item = { 'order_item_id': item_id, 'order_id': order_id, @@ -508,6 +739,7 @@ def generate_order_items(self): headers = ['order_item_id', 'order_id', 'product_id', 'sku', 'size', 'color', 'quantity', 'unit_price', 'total_price', 'final_price', 'discount_applied', 'gift_wrap', 'personalization'] +>>>>>>> origin/main self.save_to_csv('order_items', order_items, headers) return order_items @@ -526,6 +758,35 @@ def generate_returns(self): order_id = order[0] customer_id = order[1] order_date = order[2] +<<<<<<< HEAD + order_total = order[10] # total_amount is now at position 10 after column reordering + + return_date = self.timestamp_between(order_date, 'now') + + # Create return data in database schema column order to match: + # return_id, order_id, customer_id, return_date, return_reason, return_method, + # refund_amount, refund_method, return_status, restocking_fee, condition_received, processed_by, created_at + return_data = [ + self.generate_id('RET'), # return_id + order_id, # order_id + customer_id, # customer_id + return_date, # return_date + self.weighted_choice(list(return_reasons.keys()), list(return_reasons.values())), # return_reason + random.choice(['mail', 'store', 'pickup']), # return_method + round(order_total * random.uniform(0.5, 1.0), 2), # refund_amount + self.weighted_choice(list(refund_methods.keys()), list(refund_methods.values())), # refund_method + random.choice(['pending', 'approved', 'processed', 'completed']), # return_status + round(random.uniform(0, 15), 2), # restocking_fee + random.choice(['new', 'like_new', 'good', 'fair', 'poor']), # condition_received + fake.name(), # processed_by + return_date # created_at + ] + returns.append(return_data) + + headers = ['return_id', 'order_id', 'customer_id', 'return_date', 'return_reason', 'return_method', + 'refund_amount', 'refund_method', 'return_status', 'restocking_fee', + 'condition_received', 'processed_by', 'created_at'] +======= order_total = order[4] return_date = self.timestamp_between(order_date, 'now') @@ -550,6 +811,7 @@ def generate_returns(self): headers = ['return_id', 'order_id', 'customer_id', 'return_date', 'return_reason', 'return_status', 'refund_amount', 'refund_method', 'restocking_fee', 'return_method', 'condition_received', 'processed_by', 'created_at'] +>>>>>>> origin/main self.save_to_csv('returns', returns, headers) return returns @@ -575,6 +837,22 @@ def generate_return_items(self): for item in selected_items: order_item_id = item[0] +<<<<<<< HEAD + quantity = int(item[4]) # quantity is now at position 4 after column reordering, convert 
to int + + # Create return item data in database schema column order to match: + # return_item_id, return_id, order_item_id, quantity_returned, item_condition, refund_eligible, restockable + return_item_data = [ + self.generate_id('RI'), # return_item_id + return_id, # return_id + order_item_id, # order_item_id + min(quantity, random.randint(1, quantity)), # quantity_returned + self.weighted_choice(list(conditions.keys()), list(conditions.values())), # item_condition + random.random() < 0.9, # refund_eligible + random.random() < 0.8 # restockable + ] + return_items.append(return_item_data) +======= quantity = item[6] return_item = { @@ -587,6 +865,7 @@ def generate_return_items(self): 'restockable': random.random() < 0.8 } return_items.append(list(return_item.values())) +>>>>>>> origin/main headers = ['return_item_id', 'return_id', 'order_item_id', 'quantity_returned', 'item_condition', 'refund_eligible', 'restockable'] @@ -1069,11 +1348,549 @@ def generate_style_similarity_matches(self, num_matches=6000): 'recommendation_context'] self.save_to_csv('style_similarity_matches', matches, headers) return matches +<<<<<<< HEAD + def generate_campaigns(self, num_campaigns=25): + """Generate marketing campaigns""" + print(f"šŸ“¢ Generating {num_campaigns} marketing campaigns...") + + campaigns = [] + campaign_types = ['product_launch', 'promotional', 'seasonal', 'retention', 'brand_awareness'] + statuses = ['planned', 'active', 'paused', 'completed', 'cancelled'] + audiences = ['all_customers', 'women_fashion', 'men_fashion', 'young_adults', 'gift_buyers', 'loyalty_members', 'new_customers'] + + for i in range(num_campaigns): + campaign_type = random.choice(campaign_types) + start_date = self.timestamp_between('-6m', '+2m') + end_date = start_date + timedelta(days=random.randint(14, 90)) + budget_allocated = round(random.uniform(10000, 100000), 2) + budget_spent = round(budget_allocated * random.uniform(0.1, 1.2), 2) + + # Generate contextual campaign names + season = random.choice(['Spring', 'Summer', 'Fall', 'Winter', 'Holiday']) + if campaign_type == 'product_launch': + name = f"{season} Fashion Launch {random.randint(2023, 2024)}" + elif campaign_type == 'promotional': + discount = random.choice([20, 25, 30, 40, 50]) + name = f"{season} Sale {discount}% Off" + elif campaign_type == 'seasonal': + event = random.choice(['Back to School', 'Holiday Gift Guide', 'Summer Collection', 'Winter Essentials']) + name = f"{event} {random.randint(2023, 2024)}" + else: + name = f"{campaign_type.title()} Campaign {i+1}" + + campaign_data = [ + f'CAMP{i+1:03d}', # campaign_id + name, # campaign_name + campaign_type, # campaign_type + start_date, # start_date + end_date, # end_date + random.choice(audiences), # target_audience + budget_allocated, # budget_allocated + budget_spent, # budget_spent + random.randint(0, 50), # discount_offered + fake.lexify('????##') if random.random() < 0.7 else None, # promo_code + random.choice(['marketing_team', 'brand_team', 'digital_team']), # created_by + random.choice(statuses) # status + ] + campaigns.append(campaign_data) + + headers = ['campaign_id', 'campaign_name', 'campaign_type', 'start_date', 'end_date', + 'target_audience', 'budget_allocated', 'budget_spent', 'discount_offered', + 'promo_code', 'created_by', 'status'] + self.save_to_csv('campaigns', campaigns, headers) + return campaigns + + def generate_campaign_responses(self, num_responses=15000): + """Generate campaign response data""" + print(f"šŸ“§ Generating {num_responses} campaign responses...") + + if 
'campaigns' not in self.data or 'customers' not in self.data: + print(" āš ļø Warning: campaigns and customers data required") + return [] + + campaigns = self.data['campaigns'] + customers = self.data['customers'] + responses = [] + statuses = ['sent', 'delivered', 'opened', 'clicked', 'converted', 'bounced', 'unsubscribed'] + + for i in range(num_responses): + campaign = random.choice(campaigns) + customer = random.choice(customers) + + # Create realistic conversion funnel + sent_date = self.timestamp_between('-3m', 'now') + delivered = random.random() < 0.95 + opened = random.random() < 0.25 if delivered else False + clicked = random.random() < 0.03 if opened else False + converted = random.random() < 0.15 if clicked else False + + delivered_date = sent_date + timedelta(minutes=random.randint(1, 60)) if delivered else None + opened_date = delivered_date + timedelta(hours=random.randint(1, 48)) if opened else None + clicked_date = opened_date + timedelta(minutes=random.randint(1, 30)) if clicked else None + converted_date = clicked_date + timedelta(minutes=random.randint(5, 120)) if converted else None + + response_data = [ + f'RESP{i+1:06d}', # response_id + campaign[0], # campaign_id + customer[0], # customer_id + sent_date, # sent_date + delivered_date, # delivered_date + opened_date, # opened_date + clicked_date, # clicked_date + converted_date, # converted_date + f'ORDER{random.randint(1, 150000)}' if converted else None, # conversion_order_id + round(random.uniform(25, 300), 2) if converted else 0, # conversion_amount + 'converted' if converted else ('clicked' if clicked else ('opened' if opened else ('delivered' if delivered else 'sent'))) # response_status + ] + responses.append(response_data) + + headers = ['response_id', 'campaign_id', 'customer_id', 'sent_date', 'delivered_date', + 'opened_date', 'clicked_date', 'converted_date', 'conversion_order_id', + 'conversion_amount', 'response_status'] + self.save_to_csv('campaign_responses', responses, headers) + return responses + + def generate_customer_service_interactions(self, num_interactions=8000): + """Generate customer service interaction data""" + print(f"šŸŽ§ Generating {num_interactions} customer service interactions...") + + if 'customers' not in self.data: + print(" āš ļø Warning: customers data required") + return [] + + customers = self.data['customers'] + interactions = [] + channels = ['phone', 'email', 'chat', 'social_media', 'in_store'] + interaction_types = ['complaint', 'inquiry', 'return_request', 'order_status', 'product_info', 'technical_support'] + categories = ['order_issues', 'product_quality', 'shipping', 'payment', 'returns', 'account', 'general_inquiry'] + priorities = ['low', 'medium', 'high', 'urgent'] + resolution_types = ['resolved', 'escalated', 'pending', 'closed_no_resolution'] + agents = ['agent_001', 'agent_002', 'agent_003', 'agent_004', 'agent_005', 'agent_006', 'agent_007', 'agent_008'] + + for i in range(num_interactions): + customer = random.choice(customers) + interaction_date = self.timestamp_between('-6m', 'now') + + channel = random.choice(channels) + interaction_type = random.choice(interaction_types) + category = random.choice(categories) + priority = random.choice(priorities) + + # Generate realistic conversation content + subjects = { + 'complaint': f"Issue with {random.choice(['order delivery', 'product quality', 'billing', 'website'])}", + 'inquiry': f"Question about {random.choice(['product availability', 'sizing', 'return policy', 'shipping'])}", + 'return_request': f"Return 
request for order #{random.randint(1, 150000)}", + 'order_status': f"Status inquiry for order #{random.randint(1, 150000)}", + 'product_info': f"Information needed about {fake.catch_phrase()} product", + 'technical_support': f"Technical issue with {random.choice(['website', 'mobile app', 'payment', 'account'])}" + } + + subject_line = subjects.get(interaction_type, "General inquiry") + conversation_text = f"Customer contacted via {channel} regarding {subject_line.lower()}. {fake.text(max_nb_chars=200)}" + + # Generate realistic metrics + satisfaction_score = random.randint(1, 5) if random.random() < 0.8 else None + resolution_time = round(random.uniform(0.5, 48.0), 1) + escalated = random.random() < 0.1 + sentiment_score = round(random.uniform(-1, 1), 2) + + interaction_data = [ + f'INT{i+1:06d}', # interaction_id + customer[0], # customer_id + interaction_date, # interaction_date + channel, # channel + interaction_type, # interaction_type + category, # category + priority, # priority + subject_line, # subject_line + conversation_text, # conversation_text + f'ORDER{random.randint(1, 150000)}' if random.random() < 0.4 else None, # related_order_id + random.choice(resolution_types), # resolution_type + satisfaction_score, # satisfaction_score + resolution_time, # resolution_time_hours + random.choice(agents), # agent_id + escalated, # escalated + sentiment_score, # sentiment_score + ', '.join(fake.words(nb=random.randint(2, 5))), # keywords_extracted + random.random() < 0.15, # follow_up_needed + round(random.uniform(15, 85), 2) # interaction_cost + ] + interactions.append(interaction_data) + + headers = ['interaction_id', 'customer_id', 'interaction_date', 'channel', 'interaction_type', + 'category', 'priority', 'subject_line', 'conversation_text', 'related_order_id', + 'resolution_type', 'satisfaction_score', 'resolution_time_hours', 'agent_id', + 'escalated', 'sentiment_score', 'keywords_extracted', 'follow_up_needed', 'interaction_cost'] + self.save_to_csv('customer_service_interactions', interactions, headers) + return interactions + + def generate_email_engagement(self, num_engagements=20000): + """Generate email engagement metrics""" + print(f"šŸ“¬ Generating {num_engagements} email engagements...") + + if 'customers' not in self.data or 'campaigns' not in self.data: + print(" āš ļø Warning: customers and campaigns data required") + return [] + + customers = self.data['customers'] + campaigns = self.data['campaigns'] + engagements = [] + delivery_statuses = ['delivered', 'bounced', 'blocked', 'deferred'] + device_types = ['desktop', 'mobile', 'tablet'] + email_clients = ['Gmail', 'Outlook', 'Apple Mail', 'Yahoo Mail', 'Other'] + + for i in range(num_engagements): + customer = random.choice(customers) + campaign = random.choice(campaigns) + + send_timestamp = self.timestamp_between('-6m', 'now') + delivery_status = self.weighted_choice(delivery_statuses, [0.85, 0.10, 0.03, 0.02]) + + # Create realistic engagement funnel + delivered = delivery_status == 'delivered' + opened = random.random() < 0.22 if delivered else False + clicked = random.random() < 0.035 if opened else False + + open_timestamp = send_timestamp + timedelta(hours=random.randint(1, 72)) if opened else None + click_timestamp = open_timestamp + timedelta(minutes=random.randint(1, 30)) if clicked else None + + total_opens = random.randint(1, 5) if opened else 0 + total_clicks = random.randint(1, 3) if clicked else 0 + + engagement_data = [ + f'ENG{i+1:06d}', # engagement_id + customer[0], # customer_id + campaign[0], # 
campaign_id + campaign[1][:100], # email_subject (truncated) + send_timestamp, # send_timestamp + delivery_status, # delivery_status + open_timestamp, # open_timestamp + total_opens, # total_opens + click_timestamp, # click_timestamp + total_clicks, # total_clicks + ', '.join(fake.words(nb=random.randint(1, 3))) if clicked else None, # links_clicked + random.randint(5, 180) if opened else 0, # time_spent_reading + random.choice(device_types) if opened else None, # device_type + random.choice(email_clients) if opened else None, # email_client + send_timestamp + timedelta(days=random.randint(1, 30)) if random.random() < 0.02 else None, # unsubscribe_timestamp + random.randint(0, 2) if random.random() < 0.01 else 0, # forward_count + random.choice(['A', 'B']) if random.random() < 0.3 else None # ab_test_variant + ] + engagements.append(engagement_data) + + headers = ['engagement_id', 'customer_id', 'campaign_id', 'email_subject', 'send_timestamp', + 'delivery_status', 'open_timestamp', 'total_opens', 'click_timestamp', 'total_clicks', + 'links_clicked', 'time_spent_reading', 'device_type', 'email_client', + 'unsubscribe_timestamp', 'forward_count', 'ab_test_variant'] + self.save_to_csv('email_engagement', engagements, headers) + return engagements + + def generate_loyalty_profiles(self, coverage_rate=0.6): + """Generate loyalty program profiles""" + if 'customers' not in self.data: + print(" āš ļø Warning: customers data required") + return [] + + customers = self.data['customers'] + num_profiles = int(len(customers) * coverage_rate) + print(f"šŸŽ–ļø Generating {num_profiles} loyalty profiles...") + + profiles = [] + tiers = ['Bronze', 'Silver', 'Gold', 'Platinum', 'Diamond'] + tier_weights = [0.40, 0.30, 0.20, 0.08, 0.02] + categories = ['Dresses', 'Tops', 'Shoes', 'Accessories', 'Outerwear', 'Activewear'] + + selected_customers = random.sample(customers, num_profiles) + + for customer in selected_customers: + customer_id = customer[0] + reg_date = customer[8] # registration_date + birth_date = customer[5] # date_of_birth + + tier = self.weighted_choice(tiers, tier_weights) + points_lifetime = random.randint(100, 50000) + points_balance = int(points_lifetime * random.uniform(0.1, 0.8)) + + # Tier thresholds + tier_thresholds = {'Bronze': 1000, 'Silver': 5000, 'Gold': 15000, 'Platinum': 35000, 'Diamond': 75000} + next_tier_idx = tiers.index(tier) + 1 + next_threshold = tier_thresholds.get(tiers[next_tier_idx]) if next_tier_idx < len(tiers) else None + + tier_start_date = reg_date + timedelta(days=random.randint(30, 365)) + anniversary_date = reg_date + birthday_month = birth_date.month if birth_date else random.randint(1, 12) + + profile_data = [ + customer_id, # customer_id + tier, # tier_level + points_balance, # points_balance + points_lifetime, # points_lifetime + tier_start_date.date(), # tier_start_date + next_threshold, # next_tier_threshold + birthday_month, # birthday_month + anniversary_date.date(), # anniversary_date + ', '.join(random.sample(categories, random.randint(1, 3))), # preferred_categories + random.choice(['email_sms', 'email_only', 'sms_only', 'app_notifications']), # communication_preferences + tier in ['Platinum', 'Diamond'] or random.random() < 0.05, # vip_status + fake.date_between('-1y', 'now') # last_tier_review + ] + profiles.append(profile_data) + + headers = ['customer_id', 'tier_level', 'points_balance', 'points_lifetime', 'tier_start_date', + 'next_tier_threshold', 'birthday_month', 'anniversary_date', 'preferred_categories', + 'communication_preferences', 
'vip_status', 'last_tier_review'] + self.save_to_csv('loyalty_profiles', profiles, headers) + return profiles + + def generate_loyalty_activities(self, num_activities=35000): + """Generate loyalty program activities""" + print(f"šŸ† Generating {num_activities} loyalty activities...") + + if 'loyalty_profiles' not in self.data: + print(" āš ļø Warning: loyalty_profiles data required") + return [] + + profiles = self.data['loyalty_profiles'] + activities = [] + activity_types = ['purchase_points', 'bonus_points', 'referral_points', 'review_points', + 'birthday_bonus', 'redemption', 'tier_bonus', 'social_share_points'] + + for i in range(num_activities): + profile = random.choice(profiles) + customer_id = profile[0] + + activity_type = random.choice(activity_types) + activity_date = self.timestamp_between('-2y', 'now') + + # Generate appropriate point values + if activity_type == 'purchase_points': + points_earned = random.randint(10, 500) + points_redeemed = 0 + description = f"Points earned from purchase (${random.randint(25, 1250)})" + related_order_id = f"ORDER{random.randint(1, 150000)}" + elif activity_type == 'redemption': + points_earned = 0 + points_redeemed = random.randint(100, 2000) + description = f"Points redeemed for {random.choice(['discount', 'free shipping', 'product reward'])}" + related_order_id = f"ORDER{random.randint(1, 150000)}" if random.random() < 0.7 else None + elif activity_type == 'bonus_points': + points_earned = random.randint(50, 1000) + points_redeemed = 0 + description = f"Bonus points from {random.choice(['double points event', 'seasonal promotion', 'tier upgrade'])}" + related_order_id = None + else: + points_earned = random.randint(5, 100) + points_redeemed = 0 + description = f"Points from {activity_type.replace('_', ' ')}" + related_order_id = None + + expiry_date = activity_date.date() + timedelta(days=random.randint(365, 730)) + + activity_data = [ + f'ACT{i+1:06d}', # activity_id + customer_id, # customer_id + activity_type, # activity_type + activity_date, # activity_date + points_earned, # points_earned + points_redeemed, # points_redeemed + description, # description + related_order_id, # related_order_id + expiry_date # expiry_date + ] + activities.append(activity_data) + + headers = ['activity_id', 'customer_id', 'activity_type', 'activity_date', + 'points_earned', 'points_redeemed', 'description', 'related_order_id', 'expiry_date'] + self.save_to_csv('loyalty_activities', activities, headers) + return activities + + def generate_product_embeddings(self, coverage_rate=0.8): + """Generate product embeddings for ML similarity""" + if 'products' not in self.data: + print(" āš ļø Warning: products data required") + return [] + + products = self.data['products'] + num_embeddings = int(len(products) * coverage_rate) + print(f"🧠 Generating {num_embeddings} product embeddings...") + + embeddings = [] + models = ['sentence-transformers/all-MiniLM-L6-v2', 'openai/text-embedding-ada-002', 'cohere-embed-multilingual-v3.0'] + + selected_products = random.sample(products, num_embeddings) + + for i, product in enumerate(selected_products): + product_id = product[0] + product_name = product[1] + + # Generate fake embedding vector (in reality this would be computed) + vector_size = random.choice([384, 768, 1536]) # Common embedding dimensions + fake_vector = [round(random.uniform(-1, 1), 6) for _ in range(vector_size)] + vector_text = str(fake_vector) + + text_source = f"{product_name} - {product[2]} - {product[3]}" # name, brand, category + + embedding_data 
= [ + f'EMB_P{i+1:06d}', # embedding_id + product_id, # product_id + text_source, # text_source + vector_text, # embedding_vector + random.choice(models), # embedding_model + self.timestamp_between('-6m', 'now'), # created_date + random.randint(1, 3) # vector_version + ] + embeddings.append(embedding_data) + + headers = ['embedding_id', 'product_id', 'text_source', 'embedding_vector', + 'embedding_model', 'created_date', 'vector_version'] + self.save_to_csv('product_embeddings', embeddings, headers) + return embeddings + + def generate_customer_style_embeddings(self, coverage_rate=0.4): + """Generate customer style embeddings""" + if 'customers' not in self.data: + print(" āš ļø Warning: customers data required") + return [] + + customers = self.data['customers'] + num_embeddings = int(len(customers) * coverage_rate) + print(f"šŸ‘¤ Generating {num_embeddings} customer style embeddings...") + + embeddings = [] + selected_customers = random.sample(customers, num_embeddings) + + for i, customer in enumerate(selected_customers): + customer_id = customer[0] + + # Generate style preferences text + styles = ['casual', 'formal', 'trendy', 'classic', 'bohemian', 'minimalist', 'edgy', 'romantic'] + colors = ['neutral tones', 'bright colors', 'earth tones', 'pastels', 'monochrome'] + materials = ['cotton', 'silk', 'wool', 'denim', 'leather'] + + style_prefs = f"Prefers {random.choice(styles)} style with {random.choice(colors)} and {random.choice(materials)} materials" + + # Generate fake embedding vector + vector_size = random.choice([384, 768]) + fake_vector = [round(random.uniform(-1, 1), 6) for _ in range(vector_size)] + vector_text = str(fake_vector) + + embedding_data = [ + f'EMB_C{i+1:06d}', # customer_embedding_id + str(customer_id), # customer_id (as string per schema) + vector_text, # style_vector + style_prefs, # text_source + round(random.uniform(0.6, 1.0), 4), # confidence_score + self.timestamp_between('-3m', 'now'), # last_updated + random.randint(5, 50) # purchase_count_basis + ] + embeddings.append(embedding_data) + + headers = ['customer_embedding_id', 'customer_id', 'style_vector', 'text_source', + 'confidence_score', 'last_updated', 'purchase_count_basis'] + self.save_to_csv('customer_style_embeddings', embeddings, headers) + return embeddings + + def generate_fashion_research_documents(self, num_documents=150): + """Generate fashion research documents""" + print(f"šŸ“š Generating {num_documents} fashion research documents...") + + documents = [] + doc_types = ['trend_report', 'market_analysis', 'consumer_survey', 'style_guide', 'seasonal_forecast'] + sources = ['Fashion Institute', 'Trend Analytics Corp', 'Style Research Lab', 'Market Insights Inc', 'Fashion Forward Think Tank'] + themes = ['sustainable fashion', 'digital transformation', 'color trends', 'material innovation', 'consumer behavior', + 'social media influence', 'seasonal preferences', 'demographic shifts', 'price sensitivity', 'brand loyalty'] + segments = ['Gen Z', 'Millennials', 'Gen X', 'All Demographics', 'Urban Professionals', 'Students', 'Premium Shoppers'] + regions = ['North America', 'Europe', 'Asia-Pacific', 'Global', 'United States', 'Canada'] + + for i in range(num_documents): + doc_type = random.choice(doc_types) + source = random.choice(sources) + + # Generate contextual titles + if doc_type == 'trend_report': + title = f"{random.choice(['Spring', 'Summer', 'Fall', 'Winter'])} {random.randint(2023, 2024)} Fashion Trends Report" + elif doc_type == 'market_analysis': + title = 
f"{random.choice(['Quarterly', 'Annual'])} Fashion Market Analysis - {random.choice(regions)}" + else: + title = f"{doc_type.title().replace('_', ' ')} - {fake.catch_phrase()}" + + # Generate embedding vector + vector_size = 768 + fake_vector = [round(random.uniform(-1, 1), 6) for _ in range(vector_size)] + + document_data = [ + f'DOC_{i+1:06d}', # document_id + doc_type, # document_type + source, # source + title, # title + fake.date_between('-2y', 'now'), # publication_date + str(fake_vector), # embedding_vector + ', '.join(random.sample(themes, random.randint(2, 4))), # key_themes + ', '.join(random.sample(segments, random.randint(1, 3))), # customer_segments + round(random.uniform(0.6, 0.95), 2), # trend_confidence + random.choice(regions), # geographic_focus + self.timestamp_between('-2y', 'now') # created_at + ] + documents.append(document_data) + + headers = ['document_id', 'document_type', 'source', 'title', 'publication_date', + 'embedding_vector', 'key_themes', 'customer_segments', 'trend_confidence', + 'geographic_focus', 'created_at'] + self.save_to_csv('fashion_research_documents', documents, headers) + return documents + + def generate_session_summary(self, num_days=90): + """Generate daily session summary data""" + print(f"šŸ“ˆ Generating {num_days} days of session summaries...") + + summaries = [] + start_date = date.today() - timedelta(days=num_days) + + for i in range(num_days): + summary_date = start_date + timedelta(days=i) + + # Generate realistic daily metrics + is_weekend = summary_date.weekday() >= 5 + is_holiday = random.random() < 0.05 # 5% chance of holiday + + base_sessions = random.randint(800, 1500) + if is_weekend: + base_sessions = int(base_sessions * random.uniform(0.7, 0.9)) + if is_holiday: + base_sessions = int(base_sessions * random.uniform(0.4, 0.8)) + + total_sessions = base_sessions + unique_visitors = int(total_sessions * random.uniform(0.75, 0.85)) + bounce_sessions = int(total_sessions * random.uniform(0.35, 0.55)) + avg_duration = random.randint(180, 600) # 3-10 minutes in seconds + + summary_data = [ + summary_date, # summary_date + total_sessions, # total_sessions + unique_visitors, # unique_visitors + bounce_sessions, # bounce_sessions + avg_duration, # avg_session_duration + self.timestamp_between('now', 'now') # created_at + ] + summaries.append(summary_data) + + headers = ['summary_date', 'total_sessions', 'unique_visitors', 'bounce_sessions', + 'avg_session_duration', 'created_at'] + self.save_to_csv('session_summary', summaries, headers) + return summaries + + def generate_all_data(self): + """Generate complete dataset including all 26 tables for enterprise C360 system""" + print("\nšŸš€ Starting Complete C360 Data Generation...") + print("šŸ”— Generating 50k customer dataset with enterprise-scale data + all auxiliary tables") +======= def generate_all_data(self): """Generate complete dataset including reviews""" print("\nšŸš€ Starting Complete C360 Data Generation...") print("šŸ”— Generating 50k customer dataset with enterprise-scale data + reviews") +>>>>>>> origin/main # Master data first print("\n=== šŸ“¦ Generating Master Data ===") @@ -1102,22 +1919,67 @@ def generate_all_data(self): print("\n=== šŸ“± Generating Social Media & Digital Analytics ===") self.generate_social_mentions() self.generate_website_sessions() +<<<<<<< HEAD + self.generate_session_summary() + + # Marketing and Campaigns + print("\n=== šŸ“¢ Generating Marketing & Campaign Data ===") + self.generate_campaigns() + self.generate_campaign_responses() + 
self.generate_email_engagement() + + # Customer Service + print("\n=== šŸŽ§ Generating Customer Service Data ===") + self.generate_customer_service_interactions() + + # Loyalty Program + print("\n=== šŸŽ–ļø Generating Loyalty Program Data ===") + self.generate_loyalty_profiles() + self.generate_loyalty_activities() + + # ML/AI Data and Recommendations + print("\n=== šŸŽÆ Generating ML & Recommendation Data ===") + self.generate_style_similarity_matches() + self.generate_product_embeddings() + self.generate_customer_style_embeddings() + + # Research and Insights + print("\n=== šŸ“š Generating Research & Insights Data ===") + self.generate_fashion_research_documents() + + print(f"\nšŸŽ‰ Complete enterprise dataset generation finished!") +======= # Style Similarity Matches (ML/Recommendation Engine Data) print("\n=== šŸŽÆ Generating ML & Recommendation Data ===") self.generate_style_similarity_matches() print(f"\nšŸŽ‰ Complete dataset generation finished!") +>>>>>>> origin/main print(f"\nšŸ“Š Generated tables:") for table, data in self.data.items(): print(f" āœ… {table}: {len(data):,} rows") +<<<<<<< HEAD + total_rows = sum(len(data) for data in self.data.values()) + print(f"\nšŸ”— Enterprise Scale Achieved:") + print(f" āœ… 50,000 customers with proportional transactions") + print(f" āœ… {total_rows:,} total records across {len(self.data)} tables") + print(f" āœ… Complete 26-table C360 Fashion Retail database") + print(f" āœ… Contextually accurate reviews with quality correlation") + print(f" āœ… Foreign key compatible data types") + print(f" āœ… Complete referential integrity maintained") + print(f" āœ… ML/AI ready embeddings and recommendations") + print(f" āœ… Enterprise customer service and loyalty tracking") + print(f" āœ… Comprehensive marketing campaign analytics") +======= print(f"\nšŸ”— Enterprise Scale Achieved:") print(f" āœ… 50,000 customers with proportional transactions") print(f" āœ… ~700k total records across all tables") print(f" āœ… Contextually accurate reviews with quality correlation") print(f" āœ… Foreign key compatible data types") print(f" āœ… Complete referential integrity maintained") +>>>>>>> origin/main print(f"\nšŸš€ Ready for database loading!") if __name__ == "__main__": diff --git a/.data/c360-fashion-retail/postgres/brands.csv b/.data/c360-fashion-retail/postgres/brands.csv index 70cb52bd..1262216d 100644 --- a/.data/c360-fashion-retail/postgres/brands.csv +++ b/.data/c360-fashion-retail/postgres/brands.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:adbf224ab0f0706ddc88998785a9e1c1098d09d997e43fcdcd31b5291c16cf9b -size 11275 +oid sha256:5f5007c48ba1a6131538ff94f9e96ea9f569c0458fba66f3f297b5d89e6327ac +size 12446 diff --git a/.data/c360-fashion-retail/postgres/campaign_responses.csv b/.data/c360-fashion-retail/postgres/campaign_responses.csv index 6c72b3a1..6380b2a8 100644 --- a/.data/c360-fashion-retail/postgres/campaign_responses.csv +++ b/.data/c360-fashion-retail/postgres/campaign_responses.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:150a056f6f213d3d59cbb50e3175066eef424f91dd85d32f93228a9647bde330 -size 155 +oid sha256:3ea20dfed0dfee875a201e8685702cd6024ee80a288b82bc7d5548a46b4f7223 +size 1500182 diff --git a/.data/c360-fashion-retail/postgres/campaigns.csv b/.data/c360-fashion-retail/postgres/campaigns.csv index 9c84b2c8..dc639f38 100644 --- a/.data/c360-fashion-retail/postgres/campaigns.csv +++ b/.data/c360-fashion-retail/postgres/campaigns.csv @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:7a553972bd2b7a61516573fe52cf43bd125b7ef850259ea6055da25ea58f68dd -size 875 +oid sha256:9881ae95bb843a056fbf979d98f22929aaee40f80c58a4fcc89733f32866e3b3 +size 4170 diff --git a/.data/c360-fashion-retail/postgres/categories.csv b/.data/c360-fashion-retail/postgres/categories.csv index 5a2a7da2..36dc6bd4 100644 --- a/.data/c360-fashion-retail/postgres/categories.csv +++ b/.data/c360-fashion-retail/postgres/categories.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f269dc538b94f028760cadfbcb7a9f2b680e6758c395cc8ab76f71d31bb1d01d -size 1456 +oid sha256:122f38375b31bc25b16eb8f2ca2d63d86716689cf2ee92ef05c40b7ecf29e41c +size 2559 diff --git a/.data/c360-fashion-retail/postgres/customer_addresses.csv b/.data/c360-fashion-retail/postgres/customer_addresses.csv index cdfb1169..32a84a90 100644 --- a/.data/c360-fashion-retail/postgres/customer_addresses.csv +++ b/.data/c360-fashion-retail/postgres/customer_addresses.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:350200c919d7412e5d656461aa0fc0d09060abdf2a9f224ae412c0c41dc2facc -size 7287596 +oid sha256:5dbc2afbe05bb348c69a1855220b71f259a5f3cbcef30ed3eddbc6f0423d8c8b +size 8133783 diff --git a/.data/c360-fashion-retail/postgres/customer_service_interactions.csv b/.data/c360-fashion-retail/postgres/customer_service_interactions.csv index ca3caeb6..ab1afed5 100644 --- a/.data/c360-fashion-retail/postgres/customer_service_interactions.csv +++ b/.data/c360-fashion-retail/postgres/customer_service_interactions.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dbf7b547b859afa0a8e4039f862d24eb6c54633cfb0a00f2b9d1002a58f2a2b8 -size 280 +oid sha256:a00044241c2e4870e6d82d01557f62378072fbc511dcd4a0d346cba08043fa49 +size 3416263 diff --git a/.data/c360-fashion-retail/postgres/customer_style_embeddings.csv b/.data/c360-fashion-retail/postgres/customer_style_embeddings.csv index f6325458..7dd6796e 100644 --- a/.data/c360-fashion-retail/postgres/customer_style_embeddings.csv +++ b/.data/c360-fashion-retail/postgres/customer_style_embeddings.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:428f7dba0ca8e626909af250ee98274c6e9c7de028b25bc354880f88d76eeccd -size 110 +oid sha256:874968ab4bf6965cc6b8cffe874eabbbdb5497c714145c1f873148d9c241c19e +size 122238796 diff --git a/.data/c360-fashion-retail/postgres/customers.csv b/.data/c360-fashion-retail/postgres/customers.csv index ef26e1cb..74f01d19 100644 --- a/.data/c360-fashion-retail/postgres/customers.csv +++ b/.data/c360-fashion-retail/postgres/customers.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a72960511dd33016806d26493f9be0671d500083bd491bbc2215db7638f3d547 -size 9102911 +oid sha256:bfd179902d4a3342c07765165c4b9bb5294d0970bfcd35f5a6d5403a91515c54 +size 10466781 diff --git a/.data/c360-fashion-retail/postgres/email_engagement.csv b/.data/c360-fashion-retail/postgres/email_engagement.csv index 1fd4095d..1c1c7275 100644 --- a/.data/c360-fashion-retail/postgres/email_engagement.csv +++ b/.data/c360-fashion-retail/postgres/email_engagement.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:03cad2e09db3e1368d0bbefb088e0dc660fc3dfa208aeed8f01e8e2e95c4d23c -size 249 +oid sha256:70c849bbefc229f1600b2752657471b5e92ff5ffb916c966f6d18d75033a21dd +size 2176340 diff --git a/.data/c360-fashion-retail/postgres/fashion_research_documents.csv b/.data/c360-fashion-retail/postgres/fashion_research_documents.csv index e2a2f7fe..a3f2ab81 
100644 --- a/.data/c360-fashion-retail/postgres/fashion_research_documents.csv +++ b/.data/c360-fashion-retail/postgres/fashion_research_documents.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:00f7609631a58aacd813e472b308bd898297b6870933a35139e2180c924f173b -size 147 +oid sha256:d9f28a80fbb6caec7352d33d35d8791c20a603ce41e8a2ae7d9b7fa3c3864f54 +size 1232634 diff --git a/.data/c360-fashion-retail/postgres/loyalty_activities.csv b/.data/c360-fashion-retail/postgres/loyalty_activities.csv index 9f0cc856..ab43413f 100644 --- a/.data/c360-fashion-retail/postgres/loyalty_activities.csv +++ b/.data/c360-fashion-retail/postgres/loyalty_activities.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5883b56df7364b980be57b4a6a7cba171daeccbe23a74e44bf9e2e2b644fcf19 -size 123 +oid sha256:c7e58c055bb9b04e0ed85bcbbbc4132a65f1f04f71cc879bd908b86ead17e724 +size 3770027 diff --git a/.data/c360-fashion-retail/postgres/loyalty_profiles.csv b/.data/c360-fashion-retail/postgres/loyalty_profiles.csv index 7743b927..6cece98a 100644 --- a/.data/c360-fashion-retail/postgres/loyalty_profiles.csv +++ b/.data/c360-fashion-retail/postgres/loyalty_profiles.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d110018df18ecb2dbab912bc0c57fa09c07ec0a0c7d97503f0f45bf7c56579a2 -size 197 +oid sha256:7c3e501d59e0c256804eec8a2ad1832527f8df3ae605393550941277861831fa +size 3087265 diff --git a/.data/c360-fashion-retail/postgres/order_items.csv b/.data/c360-fashion-retail/postgres/order_items.csv index 06c6cfac..08f9f62d 100644 --- a/.data/c360-fashion-retail/postgres/order_items.csv +++ b/.data/c360-fashion-retail/postgres/order_items.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b49414728d698a9eb6bde16dc8ded80e8e8e3d9f907689ef9e8b91e577743ee -size 22102933 +oid sha256:04725dcfda0a515b411707e7a02fda588980da70560515f81574052343b243a0 +size 27576386 diff --git a/.data/c360-fashion-retail/postgres/orders.csv b/.data/c360-fashion-retail/postgres/orders.csv index f542c95d..9beb248f 100644 --- a/.data/c360-fashion-retail/postgres/orders.csv +++ b/.data/c360-fashion-retail/postgres/orders.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2772b1eb1e6f854c86b55e0a440695a670bf5cd16581b926a92fa42eb60d23b8 -size 20040014 +oid sha256:444f3f2b7299df3a1a62f6dee87dd71eac5b9ec6dd9ed03855d6ed86436eef2e +size 26883059 diff --git a/.data/c360-fashion-retail/postgres/product_embeddings.csv b/.data/c360-fashion-retail/postgres/product_embeddings.csv index 216fe2d6..39f5e290 100644 --- a/.data/c360-fashion-retail/postgres/product_embeddings.csv +++ b/.data/c360-fashion-retail/postgres/product_embeddings.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ad86dbee1884a58565ba5afc174e2393d98f2a7faebceae92c2d83a9a19a0b7 -size 97 +oid sha256:97330cb2515a6544ea7383a4782cf81d87797c0315ea674266210e17a12f2045 +size 4587626 diff --git a/.data/c360-fashion-retail/postgres/product_variants.csv b/.data/c360-fashion-retail/postgres/product_variants.csv index cc84a077..5d003f4c 100644 --- a/.data/c360-fashion-retail/postgres/product_variants.csv +++ b/.data/c360-fashion-retail/postgres/product_variants.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:85b6a4ac9e770a87612110a0089d97df9e741c89b4c01e08dc2813f702a72518 -size 222497 +oid sha256:e83869932be9381a70fb0af970f0c1aa4fe17cb74e60402bcdfbef8093109e5a +size 218384 diff --git a/.data/c360-fashion-retail/postgres/products.csv 
b/.data/c360-fashion-retail/postgres/products.csv index 796d2fbd..3d0c4490 100644 --- a/.data/c360-fashion-retail/postgres/products.csv +++ b/.data/c360-fashion-retail/postgres/products.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c2dc84e48618e5482d3728d7d2316a7eca50a2977a9029a72b513aa5b05162b5 -size 116641 +oid sha256:aa7a193a71238062f511c1598ccb7459e1fc3f6f22f736a38b7ece1792ee03c3 +size 105424 diff --git a/.data/c360-fashion-retail/postgres/return_items.csv b/.data/c360-fashion-retail/postgres/return_items.csv index 0424c461..dc9d7203 100644 --- a/.data/c360-fashion-retail/postgres/return_items.csv +++ b/.data/c360-fashion-retail/postgres/return_items.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e8616873de5a15755655482202ad686ac3e868c4e9cb03c0d86f005c42fdc158 -size 795126 +oid sha256:17947e7a11cde89156a51630c9c57150b1bee0d783f7103ce81939afe8b32544 +size 1043447 diff --git a/.data/c360-fashion-retail/postgres/returns.csv b/.data/c360-fashion-retail/postgres/returns.csv index 66619558..6d0edabd 100644 --- a/.data/c360-fashion-retail/postgres/returns.csv +++ b/.data/c360-fashion-retail/postgres/returns.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e43c487e93dd665d7a642647ad58aa821a3a47b244bb16f0046043ddeeb2aa6 -size 1754023 +oid sha256:6d0b86a3756fdbf60c1e0f3d00eead5bbd08ce5648f2f6302c95ddbe2610985e +size 2181273 diff --git a/.data/c360-fashion-retail/postgres/reviews.csv b/.data/c360-fashion-retail/postgres/reviews.csv index 659d1580..62518fcf 100644 --- a/.data/c360-fashion-retail/postgres/reviews.csv +++ b/.data/c360-fashion-retail/postgres/reviews.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c94a7aadbcdebf709309695787dd6704bbd02d06f7b359cc8d3cba1f05b0fc09 -size 14580389 +oid sha256:ec26edbc7fca5c0445188b6f6d40e1c5e5394d8b5a62b7966ad9739608d7d85f +size 18071472 diff --git a/.data/c360-fashion-retail/postgres/session_summary.csv b/.data/c360-fashion-retail/postgres/session_summary.csv index d944ef7e..213af5bf 100644 --- a/.data/c360-fashion-retail/postgres/session_summary.csv +++ b/.data/c360-fashion-retail/postgres/session_summary.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd40d944269ddf8c1724ba1990529dfb73d7cf42ea2cf29332a5f70a0545d3da -size 811 +oid sha256:d909b294bbdd581d2534eb0da86f120c10ae2840338e531e87451266b2046a4f +size 5122 diff --git a/.data/c360-fashion-retail/postgres/social_mentions.csv b/.data/c360-fashion-retail/postgres/social_mentions.csv index a90ec0be..e610f5a5 100644 --- a/.data/c360-fashion-retail/postgres/social_mentions.csv +++ b/.data/c360-fashion-retail/postgres/social_mentions.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8cfd34a93d9814d291c4e97c66885f8e7dcd5e53400b24d62a3f11409cd8166f -size 1696148 +oid sha256:50f25966729ee229e150af57316acd8e8896993a856b6f0283a79063a7ca46df +size 1768885 diff --git a/.data/c360-fashion-retail/postgres/style_similarity_matches.csv b/.data/c360-fashion-retail/postgres/style_similarity_matches.csv index 6f74207f..3f1e885d 100644 --- a/.data/c360-fashion-retail/postgres/style_similarity_matches.csv +++ b/.data/c360-fashion-retail/postgres/style_similarity_matches.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e2e40290f1482733086b7a0eae5c827108a56cf198c90f402cb7795ed056b82 -size 508578 +oid sha256:321f1e711e3b27ec4a31bf29dbe3d97be683c02520c639e0ff99768a127b91f2 +size 645231 diff --git 
a/.data/c360-fashion-retail/postgres/website_sessions.csv b/.data/c360-fashion-retail/postgres/website_sessions.csv index b1a80307..a4a717a7 100644 --- a/.data/c360-fashion-retail/postgres/website_sessions.csv +++ b/.data/c360-fashion-retail/postgres/website_sessions.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ae3e162daedd21cdfe31b95e5c96af8262c951dc16d6b4bbc78e4fbd9778a0ee -size 5701514 +oid sha256:30c4316c5fdc12b24672992815a0c96b2b4977cc8b51f5ed6b456955fe4fbe1f +size 6122747 diff --git a/demos/c360-fashion-retail/.env b/demos/c360-fashion-retail/.env index 83c886d5..fa65a83e 100644 --- a/demos/c360-fashion-retail/.env +++ b/demos/c360-fashion-retail/.env @@ -4,7 +4,7 @@ APP_POSTGRES_RETAIL_HASURA_CONNECTOR_PORT=7358 APP_POSTGRES_RETAIL_HASURA_LOG_LEVEL="trace" APP_POSTGRES_RETAIL_HASURA_SERVICE_TOKEN_SECRET="xoLyXuxTAz-BPza1WamCdg==" APP_POSTGRES_RETAIL_JDBC_SCHEMAS="public" -APP_POSTGRES_RETAIL_JDBC_URL="jdbc:postgresql://host.docker.internal:5433/retail_fashion?user=postgres&password=hbGciOiJIUzI1NiIsInR5cCI6IkpX" +APP_POSTGRES_RETAIL_JDBC_URL="jdbc:postgresql://host.docker.internal:5433/retail_fashion?user=postgres&password=hasura123" APP_POSTGRES_RETAIL_OTEL_EXPORTER_OTLP_ENDPOINT="http://local.hasura.dev:4317" APP_POSTGRES_RETAIL_OTEL_SERVICE_NAME="app_postgres_retail" APP_POSTGRES_RETAIL_READ_URL="http://local.hasura.dev:7358" diff --git a/demos/c360-fashion-retail/app/metadata/postgres_retail-types.hml b/demos/c360-fashion-retail/app/metadata/postgres_retail-types.hml index da30fd4d..a25236ec 100644 --- a/demos/c360-fashion-retail/app/metadata/postgres_retail-types.hml +++ b/demos/c360-fashion-retail/app/metadata/postgres_retail-types.hml @@ -1438,3 +1438,107 @@ definition: graphql: expressionTypeName: timestamptz_order_by_exp +--- +kind: BooleanExpressionType +version: v1 +definition: + name: string_bool_exp + operand: + scalar: + type: string + comparisonOperators: + - name: _eq + argumentType: string! + - name: _gt + argumentType: string! + - name: _gte + argumentType: string! + - name: _ilike + argumentType: string! + - name: _in + argumentType: "[string!]!" + - name: _iregex + argumentType: string! + - name: _like + argumentType: string! + - name: _lt + argumentType: string! + - name: _lte + argumentType: string! + - name: _neq + argumentType: string! + - name: _nilike + argumentType: string! + - name: _niregex + argumentType: string! + - name: _nlike + argumentType: string! + - name: _nregex + argumentType: string! + - name: _regex + argumentType: string! + dataConnectorOperatorMapping: + - dataConnectorName: postgres_retail + dataConnectorScalarType: string + operatorMapping: {} + logicalOperators: + enable: true + isNull: + enable: true + graphql: + typeName: string_bool_exp + +--- +kind: DataConnectorScalarRepresentation +version: v1 +definition: + dataConnectorName: postgres_retail + dataConnectorScalarType: string + representation: string + graphql: + comparisonExpressionTypeName: string_comparison_exp_8 + +--- +kind: BooleanExpressionType +version: v1 +definition: + name: int_bool_exp + operand: + scalar: + type: int_32 + comparisonOperators: + - name: _eq + argumentType: int_32! + - name: _gt + argumentType: int_32! + - name: _gte + argumentType: int_32! + - name: _in + argumentType: "[int_32!]!" + - name: _lt + argumentType: int_32! + - name: _lte + argumentType: int_32! + - name: _neq + argumentType: int_32! 
+ dataConnectorOperatorMapping: + - dataConnectorName: postgres_retail + dataConnectorScalarType: int + operatorMapping: {} + logicalOperators: + enable: true + isNull: + enable: true + graphql: + typeName: int_bool_exp + +--- +kind: DataConnectorScalarRepresentation +version: v1 +definition: + dataConnectorName: postgres_retail + dataConnectorScalarType: int + representation: int_32 + graphql: + comparisonExpressionTypeName: int_32_comparison_exp_1 + diff --git a/demos/c360-fashion-retail/compose.yaml b/demos/c360-fashion-retail/compose.yaml index 6e246006..13018db0 100644 --- a/demos/c360-fashion-retail/compose.yaml +++ b/demos/c360-fashion-retail/compose.yaml @@ -1,6 +1,33 @@ include: - path: app/connector/postgres_retail/compose.yaml services: + # PostgreSQL Database with automatic data generation and import + postgres_retail: + image: postgres:17-alpine + environment: + POSTGRES_DB: retail_fashion + POSTGRES_USER: postgres + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-hasura123} + POSTGRES_INITDB_ARGS: "--encoding=UTF-8 --lc-collate=en_US.UTF-8 --lc-ctype=en_US.UTF-8" + ports: + - "5433:5433" + volumes: + - postgres_data:/var/lib/postgresql/data + - ../../.data/c360-fashion-retail/postgres:/docker-entrypoint-initdb.d + command: > + postgres + -c port=5433 + -c shared_preload_libraries=pg_stat_statements + -c pg_stat_statements.track=all + -c max_connections=200 + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres -d retail_fashion -p 5433"] + interval: 10s + timeout: 5s + retries: 5 + networks: + - c360-network + engine: build: context: engine @@ -19,6 +46,11 @@ services: io.hasura.ddn.service-name: engine ports: - 3280:3000 + depends_on: + postgres_retail: + condition: service_healthy + networks: + - c360-network otel-collector: command: - --config=/etc/otel-collector-config.yaml @@ -26,10 +58,13 @@ services: HASURA_DDN_PAT: ${HASURA_DDN_PAT} image: otel/opentelemetry-collector:0.104.0 ports: - - 4317:4317 + - 14317:4317 - 4318:4318 volumes: - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml + networks: + - c360-network + promptql-playground: environment: CONFIG_PATH: /app/promptql_config.json @@ -47,3 +82,18 @@ services: volumes: - ./engine/build/promptql_config.json:/app/promptql_config.json - ./.promptql_playground.db:/app/promptql_playground/promptql_playground.db + depends_on: + postgres_retail: + condition: service_healthy + engine: + condition: service_started + networks: + - c360-network + +volumes: + postgres_data: + driver: local + +networks: + c360-network: + driver: bridge diff --git a/demos/c360-fashion-retail/globals/metadata/promptql-config.hml b/demos/c360-fashion-retail/globals/metadata/promptql-config.hml index 727b63fb..3a287cef 100644 --- a/demos/c360-fashion-retail/globals/metadata/promptql-config.hml +++ b/demos/c360-fashion-retail/globals/metadata/promptql-config.hml @@ -4,6 +4,7 @@ definition: featureFlags: enable_visualizations_v3: true enable_automations: true + enable_confidence_scores: true llm: provider: hasura specificLlm:
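With the compose.yaml changes above, the database comes up on host port 5433 with the credentials this diff standardizes on (POSTGRES_PASSWORD defaulting to hasura123, matching the updated JDBC URL in .env). A minimal connectivity smoke test, assuming psycopg2-binary is installed and the generated CSVs have been loaded:

    import psycopg2

    # Connection values mirror compose.yaml / .env in this diff.
    conn = psycopg2.connect(host='localhost', port=5433, dbname='retail_fashion',
                            user='postgres', password='hasura123')
    with conn.cursor() as cur:
        cur.execute('SELECT count(*) FROM customers')
        print('customers loaded:', cur.fetchone()[0])
    conn.close()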