Commit 7c3f02f

Merge pull request #2740 from ResearchHub/feat/item-export
[Personalize] Item export for AWS Personalize
2 parents df9363c + f03f152 commit 7c3f02f

21 files changed: +3694 -9 lines
src/analytics/constants/personalize_constants.py

Lines changed: 108 additions & 0 deletions
@@ -0,0 +1,108 @@
"""
Constants for AWS Personalize item data export.

Defines field names, CSV headers, and data types matching the Avro schema.
"""

# CSV Field Names (matching Avro schema)
ITEM_ID = "ITEM_ID"
ITEM_TYPE = "ITEM_TYPE"
HUB_L1 = "HUB_L1"
HUB_L2 = "HUB_L2"
HUB_IDS = "HUB_IDS"
AUTHOR_IDS = "AUTHOR_IDS"
CREATION_TIMESTAMP = "CREATION_TIMESTAMP"
TEXT = "TEXT"
TITLE = "TITLE"
UPVOTE_SCORE = "UPVOTE_SCORE"
BLUESKY_COUNT_TOTAL = "BLUESKY_COUNT_TOTAL"
TWEET_COUNT_TOTAL = "TWEET_COUNT_TOTAL"
CITATION_COUNT_TOTAL = "CITATION_COUNT_TOTAL"
PEER_REVIEW_COUNT_TOTAL = "PEER_REVIEW_COUNT_TOTAL"
HAS_ACTIVE_BOUNTY = "HAS_ACTIVE_BOUNTY"
BOUNTY_HAS_SOLUTIONS = "BOUNTY_HAS_SOLUTIONS"
RFP_IS_OPEN = "RFP_IS_OPEN"
RFP_HAS_APPLICANTS = "RFP_HAS_APPLICANTS"
PROPOSAL_IS_OPEN = "PROPOSAL_IS_OPEN"
PROPOSAL_HAS_FUNDERS = "PROPOSAL_HAS_FUNDERS"

# Delimiter for list fields (HUB_IDS, AUTHOR_IDS)
DELIMITER = "|"

# Limits for fields to prevent data bloat
MAX_HUB_IDS = 20

# CSV Headers (in order for the CSV file)
CSV_HEADERS = [
    ITEM_ID,
    ITEM_TYPE,
    HUB_L1,
    HUB_L2,
    HUB_IDS,
    CREATION_TIMESTAMP,
    TEXT,
    TITLE,
    UPVOTE_SCORE,
    BLUESKY_COUNT_TOTAL,
    TWEET_COUNT_TOTAL,
    CITATION_COUNT_TOTAL,
    PEER_REVIEW_COUNT_TOTAL,
    HAS_ACTIVE_BOUNTY,
    BOUNTY_HAS_SOLUTIONS,
    RFP_IS_OPEN,
    RFP_HAS_APPLICANTS,
    PROPOSAL_IS_OPEN,
    PROPOSAL_HAS_FUNDERS,
]

# Default values for each field
FIELD_DEFAULTS = {
    # String/ID fields (nullable)
    ITEM_ID: None,
    ITEM_TYPE: None,
    HUB_L1: None,
    HUB_L2: None,
    HUB_IDS: None,
    CREATION_TIMESTAMP: None,
    TEXT: None,
    TITLE: None,
    # Integer fields (counts, scores)
    UPVOTE_SCORE: 0,
    BLUESKY_COUNT_TOTAL: 0,
    TWEET_COUNT_TOTAL: 0,
    CITATION_COUNT_TOTAL: 0,
    PEER_REVIEW_COUNT_TOTAL: 0,
    # Boolean fields (flags)
    HAS_ACTIVE_BOUNTY: False,
    BOUNTY_HAS_SOLUTIONS: False,
    RFP_IS_OPEN: False,
    RFP_HAS_APPLICANTS: False,
    PROPOSAL_IS_OPEN: False,
    PROPOSAL_HAS_FUNDERS: False,
}

# Document types to exclude from export
EXCLUDED_DOCUMENT_TYPES = ["NOTE", "HYPOTHESIS"]

# Document types to include in export
SUPPORTED_DOCUMENT_TYPES = [
    "GRANT",
    "PREREGISTRATION",
    "DISCUSSION",
    "QUESTION",
    "PAPER",
]

# Text field maximum length (to prevent CSV cell overflow)
# Applied to both TITLE and TEXT fields
MAX_TEXT_LENGTH = 950

# ITEM_TYPE mapping for Personalize export
# Maps internal document_type to Personalize-friendly type names
ITEM_TYPE_MAPPING = {
    "PREREGISTRATION": "PROPOSAL",
    "GRANT": "RFP",
    "DISCUSSION": "POST",
    "QUESTION": "QUESTION",
    "PAPER": "PAPER",
}
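
Not part of the commit, but as a quick orientation: these constants are the single source of truth for the CSV layout, so a writer only needs CSV_HEADERS for column order and FIELD_DEFAULTS for missing values. The sketch below is illustrative; the write_items_csv name and the output path are hypothetical, and the actual export code in this PR may differ.

import csv

from analytics.constants.personalize_constants import CSV_HEADERS, FIELD_DEFAULTS


def write_items_csv(rows, path="items.csv"):
    # Hypothetical helper, not from this commit: writes mapper rows in the
    # CSV_HEADERS column order, falling back to FIELD_DEFAULTS for gaps.
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=CSV_HEADERS, extrasaction="ignore")
        writer.writeheader()
        for row in rows:
            writer.writerow({**FIELD_DEFAULTS, **row})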

src/analytics/interactions/interaction_mapper.py

Lines changed: 17 additions & 1 deletion
@@ -17,11 +17,27 @@ def map_from_upvote(vote: Vote) -> UserInteractions:
 
     Returns:
         UserInteractions instance (not saved to database)
+
+    Raises:
+        ValueError: If vote is missing required fields (user, unified_document)
     """
+    # Validate required fields
+    if not vote.created_by_id:
+        raise ValueError(f"Vote {vote.id} has no created_by user")
+
+    # Get unified_document (this is a property that can raise exceptions)
+    try:
+        unified_doc = vote.unified_document
+    except Exception as e:
+        raise ValueError(f"Vote {vote.id} has no valid unified_document: {str(e)}")
+
+    if not unified_doc:
+        raise ValueError(f"Vote {vote.id} has None unified_document")
+
     return UserInteractions(
         user=vote.created_by,
         event=UPVOTE,
-        unified_document=vote.unified_document,
+        unified_document=unified_doc,
         content_type=vote.content_type,
         object_id=vote.object_id,
         event_timestamp=vote.created_date,
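
With this change, map_from_upvote fails loudly on malformed votes instead of producing half-populated interactions. A caller iterating many votes would typically catch the ValueError and skip the record; the snippet below is a hedged sketch of that pattern, assuming map_from_upvote is importable at module level from interaction_mapper (the real codebase may expose it differently), with a hypothetical map_votes_safely helper.

import logging

from analytics.interactions.interaction_mapper import map_from_upvote

logger = logging.getLogger(__name__)


def map_votes_safely(votes):
    # Hypothetical helper, not from this commit: map each vote, skipping the
    # ones the mapper rejects (missing user or unified_document).
    interactions = []
    for vote in votes:
        try:
            interactions.append(map_from_upvote(vote))
        except ValueError as exc:
            logger.warning("Skipping vote: %s", exc)
    return interactions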

src/analytics/items/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
# Items package for AWS Personalize item export
Lines changed: 195 additions & 0 deletions
@@ -0,0 +1,195 @@
"""
Mapper class for converting ResearchhubUnifiedDocument to AWS Personalize items.
"""

from typing import Dict, Optional, Protocol, runtime_checkable

from analytics.constants.personalize_constants import (
    BLUESKY_COUNT_TOTAL,
    BOUNTY_HAS_SOLUTIONS,
    CITATION_COUNT_TOTAL,
    CREATION_TIMESTAMP,
    DELIMITER,
    FIELD_DEFAULTS,
    HAS_ACTIVE_BOUNTY,
    HUB_IDS,
    HUB_L1,
    HUB_L2,
    ITEM_ID,
    ITEM_TYPE,
    ITEM_TYPE_MAPPING,
    PEER_REVIEW_COUNT_TOTAL,
    PROPOSAL_HAS_FUNDERS,
    PROPOSAL_IS_OPEN,
    RFP_HAS_APPLICANTS,
    RFP_IS_OPEN,
    TEXT,
    TITLE,
    TWEET_COUNT_TOTAL,
    UPVOTE_SCORE,
)
from analytics.utils.personalize_item_utils import prepare_text_for_personalize
from utils.time import datetime_to_epoch_seconds


@runtime_checkable
class PrefetchedUnifiedDocument(Protocol):
    """
    UnifiedDocument with required prefetched relations.

    Required prefetch_related:
    - hubs
    - fundraises, related_bounties, grants
    """

    id: int
    document_type: str
    score: int


class PersonalizeItemMapper:
    """Mapper for converting ResearchHub documents to Personalize items."""

    def map_to_item(
        self,
        prefetched_doc: PrefetchedUnifiedDocument,
        bounty_data: dict,
        proposal_data: dict,
        rfp_data: dict,
        review_count_data: dict,
    ) -> Dict[str, Optional[str]]:
        """
        Map a prefetched ResearchhubUnifiedDocument to a Personalize item dictionary.

        Args:
            prefetched_doc: UnifiedDocument with prefetched relations
            bounty_data: Dict with has_active_bounty and has_solutions flags
            proposal_data: Dict with is_open and has_funders flags
            rfp_data: Dict with is_open and has_applicants flags
            review_count_data: Dict mapping doc_id to review count

        Returns:
            Dictionary with keys matching CSV_HEADERS
        """
        # Initialize row with default values from constants
        row = {field: default for field, default in FIELD_DEFAULTS.items()}

        # Get the concrete document from prefetched data (avoids N+1 queries)
        if prefetched_doc.document_type == "PAPER":
            # For papers, use select_related paper (no query)
            document = prefetched_doc.paper
            if not document:
                raise ValueError(f"Paper not found for unified_doc {prefetched_doc.id}")
        else:
            # For posts, get from prefetched posts (no query)
            # Access the prefetch cache directly to avoid posts.first() query
            posts = prefetched_doc.posts.all()
            if not posts:
                raise ValueError(f"Post not found for unified_doc {prefetched_doc.id}")
            document = posts[0]  # Get first from cached list

        # Map common fields
        row.update(self._map_common_fields(prefetched_doc, document))

        # Map document-type-specific fields
        if prefetched_doc.document_type == "PAPER":
            row.update(self._map_paper_fields(prefetched_doc, document))
        else:
            row.update(self._map_post_fields(prefetched_doc, document))

        # Add batch-fetched metrics
        row.update(
            {
                HAS_ACTIVE_BOUNTY: bounty_data.get("has_active_bounty", False),
                BOUNTY_HAS_SOLUTIONS: bounty_data.get("has_solutions", False),
                PROPOSAL_IS_OPEN: proposal_data.get("is_open", False),
                PROPOSAL_HAS_FUNDERS: proposal_data.get("has_funders", False),
                RFP_IS_OPEN: rfp_data.get("is_open", False),
                RFP_HAS_APPLICANTS: rfp_data.get("has_applicants", False),
                PEER_REVIEW_COUNT_TOTAL: review_count_data.get(prefetched_doc.id, 0),
            }
        )

        return row

    def _map_common_fields(
        self, prefetched_doc: PrefetchedUnifiedDocument, document
    ) -> dict:
        """Map fields common to all document types using prefetched data."""
        from hub.models import Hub

        # Timestamp
        if (
            prefetched_doc.document_type == "PAPER"
            and hasattr(document, "paper_publish_date")
            and document.paper_publish_date
        ):
            timestamp = datetime_to_epoch_seconds(document.paper_publish_date)
        else:
            timestamp = datetime_to_epoch_seconds(prefetched_doc.created_date)

        # Hub processing
        from analytics.constants.personalize_constants import MAX_HUB_IDS

        hub_ids = []
        hub_l1 = None
        hub_l2 = None

        for hub in list(prefetched_doc.hubs.all())[:MAX_HUB_IDS]:
            hub_ids.append(str(hub.id))
            if hub.namespace == Hub.Namespace.CATEGORY:
                hub_l1 = str(hub.id)
            elif hub.namespace == Hub.Namespace.SUBCATEGORY:
                hub_l2 = str(hub.id)

        return {
            ITEM_ID: str(prefetched_doc.id),
            ITEM_TYPE: ITEM_TYPE_MAPPING.get(
                prefetched_doc.document_type, prefetched_doc.document_type
            ),
            CREATION_TIMESTAMP: timestamp,
            UPVOTE_SCORE: (
                prefetched_doc.score if prefetched_doc.score is not None else 0
            ),
            HUB_L1: hub_l1,
            HUB_L2: hub_l2,
            HUB_IDS: DELIMITER.join(hub_ids) if hub_ids else None,
        }

    def _map_paper_fields(
        self, prefetched_doc: PrefetchedUnifiedDocument, paper
    ) -> dict:
        """Map paper-specific fields."""
        title = paper.paper_title or paper.title or ""
        abstract = paper.abstract or ""
        # Build hub names from prefetched hubs to avoid query
        hub_names = ",".join(hub.name for hub in prefetched_doc.hubs.all())

        text_concat = f"{title} {abstract} {hub_names}"

        fields = {
            TITLE: prepare_text_for_personalize(title),
            TEXT: prepare_text_for_personalize(text_concat),
            CITATION_COUNT_TOTAL: paper.citations if paper.citations is not None else 0,
        }

        if paper.external_metadata:
            metrics = paper.external_metadata.get("metrics", {})
            fields[BLUESKY_COUNT_TOTAL] = metrics.get("bluesky_count", 0)
            fields[TWEET_COUNT_TOTAL] = metrics.get("twitter_count", 0)

        return fields

    def _map_post_fields(self, prefetched_doc: PrefetchedUnifiedDocument, post) -> dict:
        """Map post-specific fields."""
        title = post.title or ""
        renderable_text = post.renderable_text or ""
        # Build hub names from prefetched hubs to avoid query
        hub_names = ",".join(hub.name for hub in prefetched_doc.hubs.all())

        text_concat = f"{title} {renderable_text} {hub_names}"

        return {
            TITLE: prepare_text_for_personalize(title),
            TEXT: prepare_text_for_personalize(text_concat),
        }
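
Taken together, map_to_item expects the caller to have already prefetched hubs and posts (and select_related the paper) and to have batch-computed the bounty, proposal, RFP, and review-count lookups, so the per-document work is query-free. The sketch below is a hedged illustration of that calling pattern; the build_rows function and the *_by_doc dictionaries are hypothetical placeholders for whatever the export command in this PR actually builds.

def build_rows(unified_docs, bounty_by_doc, proposal_by_doc, rfp_by_doc, review_counts):
    # Hypothetical driver, not from this commit. `unified_docs` is assumed to be
    # a queryset prepared with select_related("paper") and
    # prefetch_related("hubs", "posts"); the *_by_doc dicts map doc id -> flags.
    mapper = PersonalizeItemMapper()
    rows = []
    for doc in unified_docs:
        try:
            rows.append(
                mapper.map_to_item(
                    prefetched_doc=doc,
                    bounty_data=bounty_by_doc.get(doc.id, {}),
                    proposal_data=proposal_by_doc.get(doc.id, {}),
                    rfp_data=rfp_by_doc.get(doc.id, {}),
                    review_count_data=review_counts,
                )
            )
        except ValueError:
            # Documents without a concrete paper/post are skipped
            continue
    return rows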
