"""
Generador de 'estrellas' sintéticas para pruebas (NO para publicar en plataformas reales).
Requisitos: pip install pandas numpy faker
"""
import random
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
# Module-level Faker instance created with the 'es_ES' locale.
# NOTE(review): not referenced by any function visible in this section;
# confirm whether it is used elsewhere before removing it.
fake = Faker('es_ES')
def generate_ratings(n=1000,
                     n_products=50,
                     n_users=500,
                     start_date="2024-01-01",
                     end_date=None,
                     baseline_distribution=None):
    """
    Generate a DataFrame of synthetic star ratings.

    The returned frame has the columns
    ['rating_id', 'user_id', 'product_id', 'stars', 'timestamp', 'label', 'notes'].

    Args:
        n: number of ratings (rows) to generate.
        n_products: product ids are drawn uniformly from
            "prod_1" .. "prod_<n_products>".
        n_users: user ids are drawn uniformly from "user_1" .. "user_<n_users>".
        start_date: ISO-format date/datetime string; earliest possible timestamp.
        end_date: ISO-format date/datetime string; latest possible timestamp.
            When None, today's date (at midnight) is used.
        baseline_distribution: optional dict mapping star value (1-5) to a
            weight, e.g. {1: 0.1, 2: 0.1, 3: 0.2, 4: 0.3, 5: 0.3}. Missing
            stars get weight 0. Weights are normalised to sum to 1; if their
            sum is not positive, a uniform distribution is used instead.

    Returns:
        pandas.DataFrame with one row per rating. 'timestamp' is an ISO-format
        string, 'label' is always "synthetic" and 'notes' is always "".
        The columns are present even when n is 0.

    Raises:
        ValueError: if end_date is earlier than start_date.
    """
    columns = ["rating_id", "user_id", "product_id", "stars",
               "timestamp", "label", "notes"]

    if end_date is None:
        end_date = datetime.now().strftime("%Y-%m-%d")
    start = datetime.fromisoformat(start_date)
    end = datetime.fromisoformat(end_date)
    if end < start:
        # Fail early with a clear message instead of the opaque
        # "empty range" error random.randint would raise inside the loop.
        raise ValueError(
            f"end_date ({end_date}) must not be earlier than start_date ({start_date})"
        )

    if baseline_distribution is None:
        baseline_distribution = {1: 0.05, 2: 0.1, 3: 0.2, 4: 0.35, 5: 0.3}
    star_values = [1, 2, 3, 4, 5]
    weights = [baseline_distribution.get(s, 0.0) for s in star_values]
    # Normalise the weights; fall back to uniform when they carry no mass.
    total_w = sum(weights)
    if total_w <= 0:
        weights = [0.2] * 5
    else:
        weights = [w / total_w for w in weights]

    # The sampling window is loop-invariant, so compute it once.
    span_seconds = int((end - start).total_seconds())

    rows = []
    for i in range(n):
        # Keep this draw order (user, product, stars, timestamp) so that a
        # seeded `random` produces the same sequence as before.
        user_id = f"user_{random.randint(1, n_users)}"
        product_id = f"prod_{random.randint(1, n_products)}"
        stars = random.choices(star_values, weights=weights, k=1)[0]
        # Random timestamp between start and end (both inclusive).
        ts = start + timedelta(seconds=random.randint(0, span_seconds))
        rows.append({
            "rating_id": f"r{i+1}",
            "user_id": user_id,
            "product_id": product_id,
            "stars": stars,
            "timestamp": ts.isoformat(),
            "label": "synthetic",
            "notes": "",
        })

    # Passing the columns explicitly keeps the schema even when rows is empty.
    return pd.DataFrame(rows, columns=columns)
def inject_bot_cluster(df, product_id, n_bot_ratings=50, star=5, start_ts=None):
    """
    Append a burst of 'bot' ratings for one product and return a new DataFrame.

    Each injected row gets a user id of the form "bot_<6 random digits>", the
    given product_id and star value, and a timestamp spread over the hour
    starting at start_ts. Injected rows are identifiable by
    notes == "bot_cluster"; their label stays "synthetic".

    Args:
        df: ratings DataFrame to extend; it is not modified.
        product_id: product that receives the bot ratings.
        n_bot_ratings: number of bot ratings to add. Zero or a negative
            value adds nothing.
        star: star value assigned to every bot rating.
        start_ts: datetime at which the burst starts; defaults to
            datetime.now() when None.

    Returns:
        A new DataFrame holding the rows of df followed by the bot rows,
        with a fresh 0..N-1 index.
    """
    if start_ts is None:
        start_ts = datetime.now()

    bots = []
    for i in range(n_bot_ratings):
        bot_user = f"bot_{random.randint(100000,999999)}"
        # Spread the burst over a short interval (one hour).
        rand_sec = random.randint(0, 3600)
        ts = start_ts + timedelta(seconds=rand_sec)
        bots.append({
            # i+1 keeps ids unique within one call; the random suffix only
            # lowers the chance of clashes between separate calls.
            "rating_id": f"bot_{product_id}_{i+1}_{random.randint(1,9999)}",
            "user_id": bot_user,
            "product_id": product_id,
            "stars": star,
            "timestamp": ts.isoformat(),
            "label": "synthetic",
            "notes": "bot_cluster",
        })

    if not bots:
        # Concatenating a column-less empty frame triggers a pandas
        # FutureWarning and can change dtypes, so just return a
        # re-indexed copy of the input.
        return df.reset_index(drop=True)

    return pd.concat([df, pd.DataFrame(bots)], ignore_index=True)
def detect_anomalous_products(df, z_thresh=3.0):
    """
    Flag products whose average rating deviates strongly from the overall one.

    For every product_id the mean, standard deviation and count of 'stars'
    are computed. Each product mean is converted to a z-score against the
    mean and population standard deviation (ddof=0) of all ratings; when
    that deviation is not positive, 1.0 is used as the divisor.

    Args:
        df: DataFrame with at least 'product_id' and 'stars' columns.
        z_thresh: a product is marked suspicious when |zscore| > z_thresh.

    Returns:
        DataFrame with columns product_id, mean, std, count, zscore and
        suspicious, sorted by zscore in descending order.
    """
    ratings = df['stars']

    global_mean = ratings.mean()
    global_spread = ratings.std(ddof=0)
    if not global_spread > 0:
        # Covers zero and NaN spread alike, so the division stays defined.
        global_spread = 1.0

    per_product = (
        df.groupby('product_id')['stars']
        .agg(['mean', 'std', 'count'])
        .reset_index()
    )
    z_values = (per_product['mean'] - global_mean) / global_spread
    flagged = z_values.abs() > z_thresh

    result = per_product.assign(zscore=z_values, suspicious=flagged)
    return result.sort_values('zscore', ascending=False)
# --- EJEMPLO DE USO ---
if __name__ == "__main__":
# generar 2000 ratings sintéticos
df = generate_ratings(n=2000, n_products=200, n_users=1000, start_date="2024-01-01", end_date="2025-11-05")
# inyectar un cluster bot mitificado en prod_42
df = inject_bot_cluster(df, product_id="prod_42", n_bot_ratings=120, star=5, start_ts=datetime(20_