HopDatabase/run_scrapers.py at main · kasperg3/HopDatabase

344 lines (283 loc) · 15.1 KB
#!/usr/bin/env python3
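"""
Main scraper and merger script for HopDatabase.

This script runs all hop scrapers, combines their raw entries, scales the
aroma values of each source to a common 0-5 range, and merges the entries
into one standardized list. The merged list is written to 'data/hops.json'
and 'data/combined.json' (and to 'website/public/data/hops.json' when that
directory exists).
"""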
import json
import os
import re
from collections import defaultdict
from typing import Dict, List, Optional, Union

# Import the data model and scrapers
from hop_database.models.hop_model import HopEntry, save_hop_entries
from hop_database.scrapers import yakima_chief, barth_haas, hopsteiner, crosby_hops, john_i_haas, yakima_valley_hops, hops_australia
# Maps alternative country spellings to the canonical name used in the output.
# Keys must be lowercase: normalize_country() strips and lowercases its input
# before looking it up here.
COUNTRY_ALIASES = {
    "united states": "USA",
    "united states of america": "USA",
    "us": "USA",
    "great britain": "United Kingdom",
    "uk": "United Kingdom",
    "england": "United Kingdom",
}
def normalize_country(country: str) -> str:
    """Return the canonical spelling of a country name.

    Surrounding whitespace is removed and the lookup in COUNTRY_ALIASES is
    case-insensitive. Names without an alias are returned stripped but
    otherwise unchanged (original casing is kept).
    """
    cleaned = country.strip()
    lookup_key = cleaned.lower()
    return COUNTRY_ALIASES.get(lookup_key, cleaned)
def normalize_hop_name(name):
    """Normalize a hop name into the key used to group entries across sources.

    The name is lowercased, then stripped of:
      * parenthesised fragments, e.g. "Saaz (CZ)" -> "saaz"
      * trademark symbols (including a literal ``\\u00ae`` escape), quotes
        and stray parentheses
      * the word "brand"
      * a trailing "- XX" / "- XXX" suffix such as " - US"
      * a trailing " hop" / " hops" suffix

    Args:
        name: Raw hop name as scraped. ``None`` or an empty value is accepted.

    Returns:
        The normalized name, or ``""`` when there is nothing to normalize.
    """
    if not name:
        return ""

    name = str(name).lower()
    name = re.sub(r'\(.*?\)', '', name)
    # The name is already lowercase here, so "brand" is matched in lowercase;
    # the escaped form covers names where the (R) symbol arrived as raw text.
    name = re.sub(r'\\u00ae|[®™\'()]', '', name)
    name = re.sub(r'brand', '', name)
    # The removals above can leave runs of spaces ("simcoe  ycr 14") and
    # trailing whitespace. Collapse them first so equal names produce equal
    # keys and the '$'-anchored suffix patterns below can actually match.
    name = re.sub(r'\s+', ' ', name).strip()
    name = re.sub(r'\s*-\s*\w{2,3}$', '', name)
    # Strip trailing " hops" or " hop" suffix (common in Yakima Valley Hops names)
    name = re.sub(r'\s+hops?$', '', name)
    return name.strip()
def get_safe_float(value, default=0.0):
    """Convert *value* to a float, falling back to *default*.

    The fallback is returned when *value* is ``None``, an empty string, or
    anything ``float()`` rejects with ``ValueError`` or ``TypeError``.
    """
    if value is None or value == '':
        return default
    try:
        converted = float(value)
    except (ValueError, TypeError):
        return default
    return converted
# Normalized names that merge_hops() discards instead of treating them as a
# hop variety.
INVALID_HOP_NAMES = {"hop varieties", "hop variety", "all hops", "unknown"}

# Mapping of known equivalent hop names. Both keys and values are *normalized*
# names (see normalize_hop_name); merge_hops() groups an entry under the value
# when its normalized name matches a key.
MERGE_NAME_ALIASES = {
    "hallertauer mittelfrüher": "hallertauer mittelfrüh",
    "hallertauer mittelfrueh": "hallertauer mittelfrüh",
    "hallertau mittelfrüh": "hallertauer mittelfrüh",
    "east kent goldings": "east kent golding",
    "fuggle": "fuggles",
    # Add more aliases as needed
}
def scale_aroma_values_by_source(hops_data: List["HopEntry"]) -> List["HopEntry"]:
    """Scale aroma values to the 0-5 range, one scale factor per source.

    For every source the largest positive aroma value (across all aroma
    categories and all hops of that source) is determined. If that maximum
    exceeds 5, every numeric aroma value of the source is multiplied by
    ``5 / maximum``; otherwise the values are left at their original scale.
    In both cases each numeric value is rounded to one decimal place and
    clamped to ``[0.0, 5.0]``. Non-numeric values are left untouched, as are
    hops without a source or without a dict of aromas.

    Args:
        hops_data: Entries exposing ``source`` and ``standardized_aromas``.

    Returns:
        The same list object; the entries are modified in place.
    """
    # First pass: largest positive aroma value seen for each source.
    source_overall_max = defaultdict(float)
    for hop in hops_data:
        if not (hop.source and isinstance(hop.standardized_aromas, dict)):
            continue
        for value in hop.standardized_aromas.values():
            if isinstance(value, (int, float)) and value > 0:
                source_overall_max[hop.source] = max(source_overall_max[hop.source], float(value))

    print("\nAroma scaling analysis by source:")
    for source, max_val in source_overall_max.items():
        print(f"  {source}: max aroma value = {max_val}")

    # Track which sources were announced to avoid duplicate messages
    scaled_sources = set()

    # Second pass: scale (when needed), round and clamp the aroma values.
    for hop in hops_data:
        if not (hop.source and isinstance(hop.standardized_aromas, dict)):
            continue

        # Sources without any positive value have no recorded maximum; 1.0
        # keeps them on the "no scaling" path.
        overall_max = source_overall_max.get(hop.source, 1.0)
        needs_scaling = overall_max > 5
        scale_factor = 5.0 / overall_max if needs_scaling else 1.0

        if needs_scaling and hop.source not in scaled_sources:
            print(f"  Scaling {hop.source} by factor {scale_factor:.3f}")
            scaled_sources.add(hop.source)

        # Only existing keys are reassigned, so the dict size never changes
        # while it is being iterated.
        for aroma_category, original_value in hop.standardized_aromas.items():
            if isinstance(original_value, (int, float)):
                scaled_value = round(original_value * scale_factor, 1)
                hop.standardized_aromas[aroma_category] = min(5.0, max(0.0, scaled_value))

    return hops_data
# Numeric range fields of a product variant: lower bounds are merged with
# min(), upper bounds with max().
_VARIANT_RANGE_MIN_KEYS = frozenset({"alpha_from", "beta_from", "oil_from", "co_h_from"})
_VARIANT_RANGE_MAX_KEYS = frozenset({"alpha_to", "beta_to", "oil_to", "co_h_to"})


def _merge_product_variant(existing: Dict, variant: Dict) -> Dict:
    """Return a copy of *existing* with the fields of *variant* merged in.

    Range fields widen the stored range (min for ``*_from``, max for
    ``*_to``); a stored value that is missing or converts to 0 is replaced by
    the new value as-is. Any other field is only filled in when the existing
    variant has no truthy value for it. ``None`` / ``""`` values in *variant*
    are ignored.
    """
    merged = dict(existing)
    for key, value in variant.items():
        if value in (None, ""):
            continue
        if key in _VARIANT_RANGE_MIN_KEYS or key in _VARIANT_RANGE_MAX_KEYS:
            pick = min if key in _VARIANT_RANGE_MIN_KEYS else max
            new_val = get_safe_float(value)
            existing_num = get_safe_float(merged.get(key))
            if existing_num == 0:
                merged[key] = value
            elif new_val > 0:
                merged[key] = str(pick(existing_num, new_val))
        elif not merged.get(key):
            # Non-range field (e.g. a label or link): keep the first value
            # seen, only fill gaps from later sources.
            merged[key] = value
    return merged


def merge_hops(hops_data: List[HopEntry]) -> List[HopEntry]:
    """Merge a list of HopEntry objects into one standardized entry per hop.

    Entries are grouped by their normalized name (after applying
    MERGE_NAME_ALIASES); names that normalize to an empty string or appear in
    INVALID_HOP_NAMES are dropped. Within each group:

      * notes are lowercased, de-duplicated and sorted
      * description and country come from the first entry providing one
      * sources are joined with " / ", unique hrefs with " | ", unique
        storage values with " / "
      * alpha/beta/oil/co_h ranges (and ``*_from`` / ``*_to`` additional
        properties) span the smallest positive "from" to the largest
        positive "to" value, or 0.0 when no positive value exists
      * each aroma is the mean of its positive numeric values, rounded to
        two decimals
      * product variants are merged per "type" and sorted by type

    Args:
        hops_data: Raw entries from all scrapers.

    Returns:
        A new list with one merged HopEntry per hop name, in first-seen order.
    """
    grouped_hops = defaultdict(list)
    for hop in hops_data:
        normalized_name = normalize_hop_name(hop.name)
        normalized_name = MERGE_NAME_ALIASES.get(normalized_name, normalized_name)
        if normalized_name and normalized_name not in INVALID_HOP_NAMES:
            grouped_hops[normalized_name].append(hop)

    merged_hops = []
    for name, entries in grouped_hops.items():
        final_hop = HopEntry(name=name.capitalize())

        all_notes = set()
        all_countries = []
        all_sources = set()
        all_hrefs = []
        all_standardized_aromas = []
        all_storage = []
        all_descriptions = []
        range_values = defaultdict(lambda: {'from': [], 'to': []})
        additional_props_values = defaultdict(lambda: {'from': [], 'to': []})
        # Product variants collected across sources, keyed by variant type
        all_product_variants: Dict[str, Dict] = {}

        for hop in entries:
            all_notes.update(note.strip().lower() for note in hop.notes if note)
            if hop.country:
                all_countries.append(normalize_country(hop.country))
            if hop.source:
                all_sources.add(hop.source)
            if hop.href:
                all_hrefs.append(hop.href)
            if hop.storage:
                all_storage.append(hop.storage)
            if hop.description:
                all_descriptions.append(hop.description)
            if isinstance(hop.standardized_aromas, dict):
                all_standardized_aromas.append(hop.standardized_aromas)

            for key in ["alpha", "beta", "oil", "co_h"]:
                range_values[key]['from'].append(get_safe_float(getattr(hop, f"{key}_from")))
                range_values[key]['to'].append(get_safe_float(getattr(hop, f"{key}_to")))

            for prop_key, prop_val in hop.additional_properties.items():
                if prop_key.endswith("_from"):
                    base_key = prop_key[:-5]
                    additional_props_values[base_key]['from'].append(get_safe_float(prop_val))
                elif prop_key.endswith("_to"):
                    base_key = prop_key[:-3]
                    additional_props_values[base_key]['to'].append(get_safe_float(prop_val))

            # Merge product variants by type; entries without the attribute
            # (or with None) contribute nothing.
            for variant in getattr(hop, 'product_variants', None) or []:
                variant_type = variant.get("type", "")
                if not variant_type:
                    continue
                existing = all_product_variants.get(variant_type)
                if existing is None:
                    all_product_variants[variant_type] = dict(variant)
                else:
                    all_product_variants[variant_type] = _merge_product_variant(existing, variant)

        final_hop.notes = sorted(all_notes)
        final_hop.description = all_descriptions[0] if all_descriptions else ""
        final_hop.country = all_countries[0] if all_countries else ""
        final_hop.source = " / ".join(sorted(all_sources))

        # Store all unique hrefs, separated by " | " for multiple sources
        unique_hrefs = list(dict.fromkeys(all_hrefs))  # Remove duplicates while preserving order
        final_hop.href = " | ".join(unique_hrefs) if unique_hrefs else ""

        # Join all unique storage values (first-seen order) with " / "
        unique_storage = list(dict.fromkeys(all_storage))
        final_hop.storage = " / ".join(unique_storage) if unique_storage else ""

        # Zero means "unknown" and must not drag a range's lower bound down,
        # so only positive values take part in min/max.
        for key, values in range_values.items():
            from_vals = [v for v in values['from'] if v > 0]
            to_vals = [v for v in values['to'] if v > 0]
            setattr(final_hop, f"{key}_from", min(from_vals) if from_vals else 0.0)
            setattr(final_hop, f"{key}_to", max(to_vals) if to_vals else 0.0)

        aroma_aggregator = defaultdict(lambda: {'sum': 0, 'count': 0})
        for aroma_dict in all_standardized_aromas:
            for aroma, value in aroma_dict.items():
                # Non-numeric values cannot be averaged; skip them instead of
                # failing on the comparison.
                if isinstance(value, (int, float)) and value > 0:
                    aroma_aggregator[aroma]['sum'] += value
                    aroma_aggregator[aroma]['count'] += 1

        # Re-initialize aromas in final_hop before populating
        final_hop.standardized_aromas = {aroma: 0 for aroma in final_hop.standardized_aromas}
        for aroma, data in aroma_aggregator.items():
            if data['count'] > 0:
                final_hop.standardized_aromas[aroma] = round(data['sum'] / data['count'], 2)

        for base_key, values in additional_props_values.items():
            from_vals = [v for v in values['from'] if v > 0]
            to_vals = [v for v in values['to'] if v > 0]
            final_hop.additional_properties[f"{base_key}_from"] = min(from_vals) if from_vals else 0.0
            final_hop.additional_properties[f"{base_key}_to"] = max(to_vals) if to_vals else 0.0

        # Attach collected product variants (sorted by type name for consistency)
        final_hop.product_variants = sorted(
            all_product_variants.values(), key=lambda v: v.get("type", "")
        )

        merged_hops.append(final_hop)

    return merged_hops
def _require_hops(results: list, source: str, min_count: int = 1) -> list:
    """Raise an error if a scraper returned fewer hops than expected."""
    if len(results) < min_count:
        raise RuntimeError(
            f"Scraper '{source}' returned {len(results)} hops (expected >= {min_count}). "
            "Data collection is incomplete — aborting."
    return results
def main():
    """Run all scrapers, combine the data, and then merge it.

    Each scraper is run with ``save=False`` and must return at least one
    entry (see ``_require_hops``), otherwise the run aborts. The combined
    entries are aroma-scaled per source, merged, sorted by name and written
    to ``data/hops.json`` and ``data/combined.json`` next to this script. If
    ``website/public/data`` exists, ``hops.json`` is written there as well.

    Raises:
        RuntimeError: If any scraper returns fewer hops than required.
    """
    print("Starting hop data scraping...")

    # --- Run all scrapers ---
    print("\nScraping Yakima Chief Hops...")
    ych = _require_hops(yakima_chief.scrape(save=False), "Yakima Chief Hops (US)")
    print(f"Found {len(ych)} hops from Yakima Chief")
    ych_eu = _require_hops(
        yakima_chief.scrape(url="https://www.yakimachief.eu/commercial/hop-varieties.html?product_list_limit=all", save=False),
        "Yakima Chief Hops (EU)",
    )
    print(f"Found {len(ych_eu)} hops from Yakima Chief EU")
    # EU entries are only added when the US list has no hop of the same name.
    ych_us_names = {hop.name for hop in ych}
    ych_combined = ych + [hop for hop in ych_eu if hop.name not in ych_us_names]

    print("\nScraping Barth Haas...")
    bh = _require_hops(barth_haas.scrape(save=False), "Barth Haas")
    print(f"Found {len(bh)} hops from Barth Haas")

    print("\nScraping Hopsteiner...")
    hs = _require_hops(hopsteiner.scrape(save=False), "Hopsteiner")
    print(f"Found {len(hs)} hops from Hopsteiner")

    print("\nScraping Crosby Hops...")
    crosby = _require_hops(crosby_hops.scrape(save=False), "Crosby Hops")
    print(f"Found {len(crosby)} hops from Crosby Hops")

    print("\nScraping John I. Haas...")
    jih = _require_hops(john_i_haas.scrape(save=False), "John I. Haas")
    print(f"Found {len(jih)} hops from John I. Haas")

    print("\nScraping Yakima Valley Hops...")
    yvh = _require_hops(yakima_valley_hops.scrape(save=False), "Yakima Valley Hops")
    print(f"Found {len(yvh)} hops from Yakima Valley Hops")

    print("\nScraping Hop Products Australia (hops.com.au)...")
    hpa = _require_hops(hops_australia.scrape(save=False), "Hop Products Australia")
    print(f"Found {len(hpa)} hops from Hop Products Australia")

    # --- Combine all entries ---
    combined_hop_entries = ych_combined + bh + hs + crosby + jih + yvh + hpa
    print(f"\nTotal raw hop entries: {len(combined_hop_entries)}")

    # --- Scale aroma values by source before merging ---
    print("\nScaling aroma values by source...")
    scaled_hop_entries = scale_aroma_values_by_source(combined_hop_entries)

    # --- Run the merger on the scaled data ---
    print("\nStarting hop data merging...")
    merged_data = merge_hops(scaled_hop_entries)
    print(f"Total merged hop entries: {len(merged_data)}")

    # Sort final data by name
    merged_data.sort(key=lambda hop: hop.name)

    # All output paths are resolved relative to this script's directory.
    base_dir = os.path.dirname(__file__)

    # Save to data directory for CI/CD pipeline
    data_dir = os.path.join(base_dir, 'data')
    os.makedirs(data_dir, exist_ok=True)

    # Save as hops.json (primary output)
    hops_json_path = os.path.join(data_dir, 'hops.json')
    save_hop_entries(merged_data, hops_json_path)
    print(f"Saved merged data to {hops_json_path}")

    # Save as combined.json (for releases/backward compatibility)
    combined_json_path = os.path.join(data_dir, 'combined.json')
    save_hop_entries(merged_data, combined_json_path)
    print(f"Saved merged data to {combined_json_path}")

    # Also save to website data directory for local development
    website_data_path = os.path.join(base_dir, 'website', 'public', 'data', 'hops.json')
    if os.path.exists(os.path.dirname(website_data_path)):
        save_hop_entries(merged_data, website_data_path)
        print(f"Merged data also saved to {website_data_path}")
# Run the full scrape-and-merge pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

run_scrapers.py

Latest commit

History

run_scrapers.py

File metadata and controls