diff --git a/README.md b/README.md index f00e855..6f34cb8 100644 --- a/README.md +++ b/README.md @@ -36,20 +36,20 @@ The primary key `(channel_id, start_ts, end_ts)` prevents duplicates when the AP ### Visualizing collected ads -The helper `visualizer.py` script analyzes and visualizes ad data from the database: +The `visualizer/main.py` script analyzes and visualizes ad data from the database: ```bash # Process all channels (default) -uv run python utils/visualizer.py +uv run ./visualizer/main.py # Process a specific channel -uv run python utils/visualizer.py +uv run ./visualizer/main.py # Filter by date range -uv run python utils/visualizer.py --start-date 2025-11-28 --end-date 2025-12-21 +uv run ./visualizer/main.py --start-date 2025-11-28 --end-date 2025-12-21 # Single channel with date filter -uv run python utils/visualizer.py --start-date 2025-11-28 +uv run ./visualizer/main.py --start-date 2025-11-28 ``` **Single channel mode** displays: @@ -57,7 +57,7 @@ uv run python utils/visualizer.py --start-date 2025-11-28 - A 24h profile (bars = average ad minutes per day, line = average break count) - A minute-vs-hour heatmap showing ad coverage -**All channels mode** generates additional visualizations saved to `visualizer/`: +**All channels mode** generates additional visualizations saved to `visualizer_output/`: - Combined hourly profile and heatmap for each channel - Weekday analysis per channel (ad breaks by day of week, weekday×hour heatmap) - Weekly ad patterns overview across all channels diff --git a/visualizer/data_loader.py b/visualizer/data_loader.py index 267f38e..95a28bd 100644 --- a/visualizer/data_loader.py +++ b/visualizer/data_loader.py @@ -1,15 +1,18 @@ +"""Data loading utilities for the ad visualizer.""" + import sqlite3 from typing import Sequence, List, Optional from pathlib import Path import sys +from utils.scrap import DB_PATH, get_connection + # Allow running as a script from anywhere sys.path.insert(0, 
str(Path(__file__).resolve().parent.parent.parent)) -from utils.scrap import DB_PATH, get_connection - Row = Sequence + def load_ads_data( channel_id: str, start_date: Optional[str] = None, end_date: Optional[str] = None ) -> List[Row]: @@ -50,4 +53,4 @@ def list_channels() -> List[str]: ) return [row[0] for row in cursor.fetchall()] finally: - conn.close() \ No newline at end of file + conn.close() diff --git a/visualizer/main.py b/visualizer/main.py index db7bc67..a30d350 100644 --- a/visualizer/main.py +++ b/visualizer/main.py @@ -27,6 +27,7 @@ from visualizer.plotter import ( from visualizer.text_output import print_stats, build_overview_text from visualizer.utils import CHANNELS_DATA + def process_all_channels(start_date, end_date) -> None: """Process all channels in the database and generate visualizations.""" output_dir = Path("visualizer_output") @@ -35,8 +36,8 @@ def process_all_channels(start_date, end_date) -> None: file.unlink() channel_ids = list_channels() - all_channels_plot_data = [] # Data for combined weekday plots - all_channels_ranking_data = [] # Data for channel rankings + all_channels_plot_data = [] # Data for combined weekday plots + all_channels_ranking_data = [] # Data for channel rankings for channel_id in channel_ids: print(f"Processing channel {channel_id}...") @@ -46,14 +47,30 @@ def process_all_channels(start_date, end_date) -> None: hourly_profile = compute_hourly_profile(rows) heatmap = compute_heatmap(rows) - plot_combined(channel_id, hourly_profile, heatmap, stats=stats, save=True, output_dir=output_dir, channels_data=CHANNELS_DATA, build_overview_text_func=build_overview_text) + plot_combined( + channel_id, + hourly_profile, + heatmap, + stats=stats, + save=True, + output_dir=output_dir, + channels_data=CHANNELS_DATA, + build_overview_text_func=build_overview_text, + ) weekday_profile = compute_weekday_profile(rows) weekday_heatmap = compute_weekday_hour_heatmap(rows) weekday_hour_counts = compute_weekday_hour_counts(rows) 
plot_weekday_channel( - channel_id, weekday_profile, weekday_hour_counts, stats=stats, save=True, output_dir=output_dir, channels_data=CHANNELS_DATA, build_overview_text_func=build_overview_text + channel_id, + weekday_profile, + weekday_hour_counts, + stats=stats, + save=True, + output_dir=output_dir, + channels_data=CHANNELS_DATA, + build_overview_text_func=build_overview_text, ) all_channels_plot_data.append( @@ -71,8 +88,18 @@ def process_all_channels(start_date, end_date) -> None: } ) - plot_weekday_overview(all_channels_plot_data, save=True, output_dir=output_dir, channels_data=CHANNELS_DATA) - plot_channel_rankings(all_channels_ranking_data, save=True, output_dir=output_dir, channels_data=CHANNELS_DATA) + plot_weekday_overview( + all_channels_plot_data, + save=True, + output_dir=output_dir, + channels_data=CHANNELS_DATA, + ) + plot_channel_rankings( + all_channels_ranking_data, + save=True, + output_dir=output_dir, + channels_data=CHANNELS_DATA, + ) def main() -> None: @@ -110,10 +137,24 @@ def main() -> None: if not args.no_plot: hourly_profile = compute_hourly_profile(rows) - plot_hourly_profile(args.channel_id, hourly_profile, stats=stats, output_dir=Path("visualizer_output"), channels_data=CHANNELS_DATA, build_overview_text_func=build_overview_text) + plot_hourly_profile( + args.channel_id, + hourly_profile, + stats=stats, + output_dir=Path("visualizer_output"), + channels_data=CHANNELS_DATA, + build_overview_text_func=build_overview_text, + ) heatmap = compute_heatmap(rows) - plot_heatmap(args.channel_id, heatmap, stats=stats, output_dir=Path("visualizer_output"), channels_data=CHANNELS_DATA, build_overview_text_func=build_overview_text) + plot_heatmap( + args.channel_id, + heatmap, + stats=stats, + output_dir=Path("visualizer_output"), + channels_data=CHANNELS_DATA, + build_overview_text_func=build_overview_text, + ) if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/visualizer/plotter.py b/visualizer/plotter.py index 
77af7fb..011a150 100644 --- a/visualizer/plotter.py +++ b/visualizer/plotter.py @@ -1,7 +1,11 @@ -import matplotlib.pyplot as plt -from matplotlib import font_manager as font_manager +"""Plotting utilities for the ad visualizer.""" + from pathlib import Path -from typing import Dict, List, Callable +from typing import Dict, List, Callable, Optional +import matplotlib.pyplot as plt +from matplotlib import font_manager + +from .utils import format_duration, get_channel_name FPATH = "libs/LibertinusSerif-Regular.otf" prop = font_manager.FontProperties(fname=FPATH, size=14) @@ -13,13 +17,9 @@ try: if font_name: plt.rcParams["font.family"] = font_name plt.rcParams["font.size"] = prop.get_size() -except ( - Exception -): # pylint: disable=broad-exception-caught # pragma: no cover - optional font may be missing +except (OSError, ValueError): font_name = None -# Renamed _format_duration and _human_ts to be accessible -from visualizer.utils import format_duration, human_ts, CHANNELS_DATA def plot_hourly_profile( channel_id: str, @@ -27,10 +27,12 @@ def plot_hourly_profile( stats: Dict | None = None, save: bool = False, output_dir: Path = Path("."), - channels_data: Dict = {}, - build_overview_text_func: Callable[[str, Dict], str] = lambda x, y: "" + channels_data: Optional[Dict] = None, + build_overview_text_func: Callable[[str, Dict], str] = lambda x, y: "", ) -> None: """Plot the average ad activity per hour of day.""" + if channels_data is None: + channels_data = {} if not profile or not profile.get("days"): print("No data available or not enough distinct days for the hourly plot.") return @@ -55,10 +57,7 @@ def plot_hourly_profile( ax_right.plot(hours, avg_counts, color="tab:orange", marker="o") ax_right.set_ylabel("Avg number of breaks", color="tab:orange", fontproperties=prop) - channel_name = channel_id - for ch_id, channel_info in (channels_data or {}).items(): - if ch_id == channel_id: - channel_name = channel_info["name"] + channel_name = 
get_channel_name(channel_id, channels_data) for t in ax_left.get_yticklabels(): t.set_fontproperties(prop) @@ -74,7 +73,9 @@ def plot_hourly_profile( ) if stats: - overview_text = build_overview_text_func(channel_id, stats, channels_data=channels_data) + overview_text = build_overview_text_func( + channel_id, stats, channels_data=channels_data + ) fig.text( 0.73, 0.5, @@ -104,10 +105,12 @@ def plot_heatmap( stats: Dict | None = None, save: bool = False, output_dir: Path = Path("."), - channels_data: Dict = {}, - build_overview_text_func: Callable[[str, Dict], str] = lambda x, y: "" + channels_data: Optional[Dict] = None, + build_overview_text_func: Callable[[str, Dict], str] = lambda x, y: "", ) -> None: """Plot a heatmap of ad minute coverage by minute of hour and hour of day.""" + if channels_data is None: + channels_data = {} if not heatmap_data or not heatmap_data.get("days"): print("No data available or not enough distinct days for the heatmap plot.") return @@ -137,10 +140,7 @@ def plot_heatmap( cbar = fig.colorbar(im, ax=ax) cbar.set_label("Share of minute spent in ads per day", fontproperties=prop) - channel_name = channel_id - for ch_id, channel_info in (channels_data or {}).items(): - if ch_id == channel_id: - channel_name = channel_info["name"] + channel_name = get_channel_name(channel_id, channels_data) fig.suptitle( ( @@ -151,7 +151,9 @@ def plot_heatmap( ) if stats: - overview_text = build_overview_text_func(channel_id, stats, channels_data=channels_data) + overview_text = build_overview_text_func( + channel_id, stats, channels_data=channels_data + ) fig.text( 0.73, 0.5, @@ -182,10 +184,12 @@ def plot_combined( stats: Dict | None = None, save: bool = False, output_dir: Path = Path("."), - channels_data: Dict = {}, - build_overview_text_func: Callable[[str, Dict], str] = lambda x, y: "" + channels_data: Optional[Dict] = None, + build_overview_text_func: Callable[[str, Dict], str] = lambda x, y: "", ) -> None: """Plot both hourly profile and heatmap in 
a single figure with the overview text box.""" + if channels_data is None: + channels_data = {} if not profile or not profile.get("days"): print("No data available for the hourly plot.") return @@ -193,11 +197,7 @@ def plot_combined( print("No data available for the heatmap plot.") return - channel_name = channel_id - for ch_id, channel_info in (channels_data or {}).items(): - if ch_id == channel_id: - channel_name = channel_info["name"] - break + channel_name = get_channel_name(channel_id, channels_data) fig, (ax_hourly, ax_heatmap) = plt.subplots(2, 1, figsize=(14, 10)) @@ -262,7 +262,9 @@ def plot_combined( ) if stats: - overview_text = build_overview_text_func(channel_id, stats, channels_data=channels_data) + overview_text = build_overview_text_func( + channel_id, stats, channels_data=channels_data + ) fig.text( 0.73, 0.5, @@ -290,7 +292,7 @@ def plot_weekday_overview( all_channels_data: List[Dict], save: bool = False, output_dir: Path = Path("."), - channels_data: Dict = {} + channels_data: Optional[Dict] = None, ) -> None: """ Plot a weekday overview for all channels. 
@@ -298,6 +300,8 @@ def plot_weekday_overview( - A bar showing number of ads per weekday - A horizontal heatmap strip showing ad coverage by weekday x hour """ + if channels_data is None: + channels_data = {} if not all_channels_data: print("No data available for weekday overview.") return @@ -315,11 +319,7 @@ def plot_weekday_overview( for data in all_channels_data: channel_id = data["channel_id"] - channel_name = channel_id - for ch_id, channel_info in (channels_data or {}).items(): - if ch_id == channel_id: - channel_name = channel_info["name"] - break + channel_name = get_channel_name(channel_id, channels_data) channel_names.append(f"{channel_name}") weekday_profile = data.get("weekday_profile", {}) @@ -335,15 +335,13 @@ def plot_weekday_overview( normalized_row = [] for weekday in range(7): for hour in range(24): - val = ( - grid[weekday][hour] / max(hm_days_seen[weekday], 1) / 3600 - ) + val = grid[weekday][hour] / max(hm_days_seen[weekday], 1) / 3600 normalized_row.append(min(val, 1.0)) heatmap_plot_data.append(normalized_row) x = range(num_channels) bar_width = 0.12 - colors = plt.cm.tab10(range(7)) + colors = plt.get_cmap("tab10").colors[:7] for i, weekday in enumerate(weekday_names): offsets = [xi + (i - 3) * bar_width for xi in x] @@ -401,27 +399,24 @@ def plot_weekday_channel( stats: Dict | None = None, save: bool = False, output_dir: Path = Path("."), - channels_data: Dict = {}, - build_overview_text_func: Callable[[str, Dict], str] = lambda x, y: "" + channels_data: Optional[Dict] = None, + build_overview_text_func: Callable[[str, Dict], str] = lambda x, y: "", ) -> None: """ Plot a weekday overview for a single channel. 
- Shows: - Bar chart of ad breaks per weekday - Heatmap of ad break counts by weekday x hour (7 rows x 24 columns) - Stats text box on the right """ + if channels_data is None: + channels_data = {} if not weekday_profile or not weekday_hour_counts: print(f"No weekday data available for channel {channel_id}.") return weekday_names = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] - channel_name = channel_id - for ch_id, channel_info in (channels_data or {}).items(): - if ch_id == channel_id: - channel_name = channel_info["name"] - break + channel_name = get_channel_name(channel_id, channels_data) fig, (ax_bars, ax_heatmap) = plt.subplots(2, 1, figsize=(14, 8)) @@ -499,7 +494,9 @@ def plot_weekday_channel( ) if stats: - overview_text = build_overview_text_func(channel_id, stats, channels_data=channels_data) + overview_text = build_overview_text_func( + channel_id, stats, channels_data=channels_data + ) fig.text( 0.73, 0.5, @@ -527,7 +524,7 @@ def plot_channel_rankings( all_stats: List[Dict], save: bool = False, output_dir: Path = Path("."), - channels_data: Dict = {} + channels_data: Optional[Dict] = None, ) -> None: """ Plot rankings of all channels based on: @@ -535,6 +532,8 @@ def plot_channel_rankings( - Total ad duration - Longest single ad break """ + if channels_data is None: + channels_data = {} if not all_stats: print("No data available for channel rankings.") return @@ -546,11 +545,7 @@ def plot_channel_rankings( if not stats: continue - channel_name = channel_id - for ch_id, channel_info in (channels_data or {}).items(): - if ch_id == channel_id: - channel_name = channel_info["name"] - break + channel_name = get_channel_name(channel_id, channels_data) max_break_duration = stats["max_break"][0] if stats.get("max_break") else 0 @@ -568,7 +563,9 @@ def plot_channel_rankings( print("No channel data for rankings.") return - fig, axes = plt.subplots(1, 3, figsize=(18, max(8, len(channels_data_for_plot) * 0.4))) + fig, axes = plt.subplots( + 1, 3, figsize=(18, 
max(8, len(channels_data_for_plot) * 0.4)) + ) rankings = [ ("total_ads", "Total Number of Ads", "Number of ad breaks", "tab:blue"), @@ -577,7 +574,9 @@ def plot_channel_rankings( ] for ax, (metric, title, xlabel, color) in zip(axes, rankings): - sorted_data = sorted(channels_data_for_plot, key=lambda x, m=metric: x[m], reverse=True) + sorted_data = sorted( + channels_data_for_plot, key=lambda x, m=metric: x[m], reverse=True + ) names = [d["channel_name"] for d in sorted_data] values = [d[metric] for d in sorted_data] @@ -626,4 +625,4 @@ def plot_channel_rankings( filename = output_dir / "channel_rankings.png" fig.savefig(filename, dpi=300) print(f"Channel rankings saved to {filename}") - plt.close(fig) \ No newline at end of file + plt.close(fig) diff --git a/visualizer/stats_computer.py b/visualizer/stats_computer.py index 6b1a53d..afb9236 100644 --- a/visualizer/stats_computer.py +++ b/visualizer/stats_computer.py @@ -1,3 +1,5 @@ +"""Statistics computation utilities for the visualizer.""" + from collections import defaultdict from datetime import datetime, timedelta import statistics @@ -215,4 +217,4 @@ def compute_weekday_hour_heatmap(rows: Iterable[Row]) -> Dict: return { "grid": heatmap, "days_seen": [len(s) for s in weekday_days_seen], - } \ No newline at end of file + } diff --git a/visualizer/text_output.py b/visualizer/text_output.py index 34ed3ba..cbc6813 100644 --- a/visualizer/text_output.py +++ b/visualizer/text_output.py @@ -1,6 +1,8 @@ -from datetime import datetime +"""Text output utilities for the visualizer.""" + from typing import Dict -from visualizer.utils import format_duration, human_ts, CHANNELS_DATA +from .utils import format_duration, human_ts, CHANNELS_DATA, get_channel_name + def print_stats(channel_id: str, stats: Dict) -> None: """Print formatted ad break statistics to the console.""" @@ -36,18 +38,15 @@ def print_stats(channel_id: str, stats: Dict) -> None: ) -def build_overview_text(channel_id: str, stats: Dict, channels_data: Dict 
= CHANNELS_DATA) -> str: +def build_overview_text( + channel_id: str, stats: Dict, channels_data: Dict = CHANNELS_DATA +) -> str: """Build a multi-line string with channel overview stats.""" if not stats: return "" - max_break_duration, max_break_row = stats["max_break"] - channel_name = channel_id - for ch_id, channel_info in (channels_data or {}).items(): - if ch_id == channel_id: - channel_name = channel_info["name"] - break + channel_name = get_channel_name(channel_id, channels_data) lines = [ f"Channel: {channel_name} ({channel_id})", @@ -60,4 +59,4 @@ def build_overview_text(channel_id: str, stats: Dict, channels_data: Dict = CHAN f"Longest break: {format_duration(max_break_duration)}", f" ({human_ts(max_break_row[1])} → {human_ts(max_break_row[2])})", ] - return "\n".join(lines) \ No newline at end of file + return "\n".join(lines) diff --git a/visualizer/utils.py b/visualizer/utils.py index 712bd13..f9142a7 100644 --- a/visualizer/utils.py +++ b/visualizer/utils.py @@ -1,16 +1,19 @@ +"""Utility functions for the visualizer.""" + from datetime import datetime -import sys from pathlib import Path from typing import Dict +import sys + +from utils.scrap import fetch_service_plan # Allow running as a script from anywhere sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) -from utils.scrap import fetch_service_plan - # Load CHANNELS_DATA once when this module is imported CHANNELS_DATA: Dict = fetch_service_plan() + def format_duration(seconds: int) -> str: """Format a duration in seconds into a human-readable string (e.g., '1h 2m 3s').""" minutes, secs = divmod(seconds, 60) @@ -24,4 +27,16 @@ def format_duration(seconds: int) -> str: def human_ts(ts_value: int) -> str: """Convert a Unix timestamp to a human-readable date and time string.""" - return datetime.fromtimestamp(ts_value).strftime("%d/%m/%Y at %H:%M:%S") \ No newline at end of file + return datetime.fromtimestamp(ts_value).strftime("%d/%m/%Y at %H:%M:%S") + + +def 
get_channel_name(channel_id: str, channels_data: Dict = None) -> str: + """Get the channel name from channel_id, or return channel_id if not found.""" + if channels_data is None: + channels_data = CHANNELS_DATA + channel_name = channel_id + for ch_id, channel_info in channels_data.items(): + if ch_id == channel_id: + channel_name = channel_info["name"] + break + return channel_name