Commit 7c3f02f

Merge pull request #2740 from ResearchHub/feat/item-export
[Personalize] Item export for AWS Personalize
2 parents df9363c + f03f152 commit 7c3f02f

21 files changed: +3694 -9 lines
src/analytics/constants/personalize_constants.py

Lines changed: 108 additions & 0 deletions
@@ -0,0 +1,108 @@
"""
Constants for AWS Personalize item data export.

Defines field names, CSV headers, and data types matching the Avro schema.
"""

# CSV Field Names (matching Avro schema)
ITEM_ID = "ITEM_ID"
ITEM_TYPE = "ITEM_TYPE"
HUB_L1 = "HUB_L1"
HUB_L2 = "HUB_L2"
HUB_IDS = "HUB_IDS"
AUTHOR_IDS = "AUTHOR_IDS"
CREATION_TIMESTAMP = "CREATION_TIMESTAMP"
TEXT = "TEXT"
TITLE = "TITLE"
UPVOTE_SCORE = "UPVOTE_SCORE"
BLUESKY_COUNT_TOTAL = "BLUESKY_COUNT_TOTAL"
TWEET_COUNT_TOTAL = "TWEET_COUNT_TOTAL"
CITATION_COUNT_TOTAL = "CITATION_COUNT_TOTAL"
PEER_REVIEW_COUNT_TOTAL = "PEER_REVIEW_COUNT_TOTAL"
HAS_ACTIVE_BOUNTY = "HAS_ACTIVE_BOUNTY"
BOUNTY_HAS_SOLUTIONS = "BOUNTY_HAS_SOLUTIONS"
RFP_IS_OPEN = "RFP_IS_OPEN"
RFP_HAS_APPLICANTS = "RFP_HAS_APPLICANTS"
PROPOSAL_IS_OPEN = "PROPOSAL_IS_OPEN"
PROPOSAL_HAS_FUNDERS = "PROPOSAL_HAS_FUNDERS"

# Delimiter for list fields (HUB_IDS, AUTHOR_IDS)
DELIMITER = "|"

# Limits for fields to prevent data bloat
MAX_HUB_IDS = 20

# CSV Headers (in order for the CSV file)
CSV_HEADERS = [
    ITEM_ID,
    ITEM_TYPE,
    HUB_L1,
    HUB_L2,
    HUB_IDS,
    CREATION_TIMESTAMP,
    TEXT,
    TITLE,
    UPVOTE_SCORE,
    BLUESKY_COUNT_TOTAL,
    TWEET_COUNT_TOTAL,
    CITATION_COUNT_TOTAL,
    PEER_REVIEW_COUNT_TOTAL,
    HAS_ACTIVE_BOUNTY,
    BOUNTY_HAS_SOLUTIONS,
    RFP_IS_OPEN,
    RFP_HAS_APPLICANTS,
    PROPOSAL_IS_OPEN,
    PROPOSAL_HAS_FUNDERS,
]

# Default values for each field
FIELD_DEFAULTS = {
    # String/ID fields (nullable)
    ITEM_ID: None,
    ITEM_TYPE: None,
    HUB_L1: None,
    HUB_L2: None,
    HUB_IDS: None,
    CREATION_TIMESTAMP: None,
    TEXT: None,
    TITLE: None,
    # Integer fields (counts, scores)
    UPVOTE_SCORE: 0,
    BLUESKY_COUNT_TOTAL: 0,
    TWEET_COUNT_TOTAL: 0,
    CITATION_COUNT_TOTAL: 0,
    PEER_REVIEW_COUNT_TOTAL: 0,
    # Boolean fields (flags)
    HAS_ACTIVE_BOUNTY: False,
    BOUNTY_HAS_SOLUTIONS: False,
    RFP_IS_OPEN: False,
    RFP_HAS_APPLICANTS: False,
    PROPOSAL_IS_OPEN: False,
    PROPOSAL_HAS_FUNDERS: False,
}

# Document types to exclude from export
EXCLUDED_DOCUMENT_TYPES = ["NOTE", "HYPOTHESIS"]

# Document types to include in export
SUPPORTED_DOCUMENT_TYPES = [
    "GRANT",
    "PREREGISTRATION",
    "DISCUSSION",
    "QUESTION",
    "PAPER",
]

# Text field maximum length (to prevent CSV cell overflow)
# Applied to both TITLE and TEXT fields
MAX_TEXT_LENGTH = 950

# ITEM_TYPE mapping for Personalize export
# Maps internal document_type to Personalize-friendly type names
ITEM_TYPE_MAPPING = {
    "PREREGISTRATION": "PROPOSAL",
    "GRANT": "RFP",
    "DISCUSSION": "POST",
    "QUESTION": "QUESTION",
    "PAPER": "PAPER",
}
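
Not part of the commit, but as a quick orientation: these constants are the single source of truth for the CSV layout, so a writer only needs CSV_HEADERS for column order and FIELD_DEFAULTS for missing values. The sketch below is illustrative; the write_items_csv name and the output path are hypothetical, and the actual export code in this PR may differ.

import csv

from analytics.constants.personalize_constants import CSV_HEADERS, FIELD_DEFAULTS


def write_items_csv(rows, path="items.csv"):
    # Hypothetical helper, not from this commit: writes mapper rows in the
    # CSV_HEADERS column order, falling back to FIELD_DEFAULTS for gaps.
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=CSV_HEADERS, extrasaction="ignore")
        writer.writeheader()
        for row in rows:
            writer.writerow({**FIELD_DEFAULTS, **row})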

src/analytics/interactions/interaction_mapper.py

Lines changed: 17 additions & 1 deletion
@@ -17,11 +17,27 @@ def map_from_upvote(vote: Vote) -> UserInteractions:
 
     Returns:
         UserInteractions instance (not saved to database)
+
+    Raises:
+        ValueError: If vote is missing required fields (user, unified_document)
     """
+    # Validate required fields
+    if not vote.created_by_id:
+        raise ValueError(f"Vote {vote.id} has no created_by user")
+
+    # Get unified_document (this is a property that can raise exceptions)
+    try:
+        unified_doc = vote.unified_document
+    except Exception as e:
+        raise ValueError(f"Vote {vote.id} has no valid unified_document: {str(e)}")
+
+    if not unified_doc:
+        raise ValueError(f"Vote {vote.id} has None unified_document")
+
     return UserInteractions(
         user=vote.created_by,
         event=UPVOTE,
-        unified_document=vote.unified_document,
+        unified_document=unified_doc,
         content_type=vote.content_type,
         object_id=vote.object_id,
         event_timestamp=vote.created_date,
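
With this change, map_from_upvote fails loudly on malformed votes instead of producing half-populated interactions. A caller iterating many votes would typically catch the ValueError and skip the record; the snippet below is a hedged sketch of that pattern, assuming map_from_upvote is importable at module level from interaction_mapper (the real codebase may expose it differently), with a hypothetical map_votes_safely helper.

import logging

from analytics.interactions.interaction_mapper import map_from_upvote

logger = logging.getLogger(__name__)


def map_votes_safely(votes):
    # Hypothetical helper, not from this commit: map each vote, skipping the
    # ones the mapper rejects (missing user or unified_document).
    interactions = []
    for vote in votes:
        try:
            interactions.append(map_from_upvote(vote))
        except ValueError as exc:
            logger.warning("Skipping vote: %s", exc)
    return interactions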

src/analytics/items/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
# Items package for AWS Personalize item export
Lines changed: 195 additions & 0 deletions
@@ -0,0 +1,195 @@
"""
Mapper class for converting ResearchhubUnifiedDocument to AWS Personalize items.
"""

from typing import Dict, Optional, Protocol, runtime_checkable

from analytics.constants.personalize_constants import (
    BLUESKY_COUNT_TOTAL,
    BOUNTY_HAS_SOLUTIONS,
    CITATION_COUNT_TOTAL,
    CREATION_TIMESTAMP,
    DELIMITER,
    FIELD_DEFAULTS,
    HAS_ACTIVE_BOUNTY,
    HUB_IDS,
    HUB_L1,
    HUB_L2,
    ITEM_ID,
    ITEM_TYPE,
    ITEM_TYPE_MAPPING,
    PEER_REVIEW_COUNT_TOTAL,
    PROPOSAL_HAS_FUNDERS,
    PROPOSAL_IS_OPEN,
    RFP_HAS_APPLICANTS,
    RFP_IS_OPEN,
    TEXT,
    TITLE,
    TWEET_COUNT_TOTAL,
    UPVOTE_SCORE,
)
from analytics.utils.personalize_item_utils import prepare_text_for_personalize
from utils.time import datetime_to_epoch_seconds


@runtime_checkable
class PrefetchedUnifiedDocument(Protocol):
    """
    UnifiedDocument with required prefetched relations.

    Required prefetch_related:
    - hubs
    - fundraises, related_bounties, grants
    """

    id: int
    document_type: str
    score: int


class PersonalizeItemMapper:
    """Mapper for converting ResearchHub documents to Personalize items."""

    def map_to_item(
        self,
        prefetched_doc: PrefetchedUnifiedDocument,
        bounty_data: dict,
        proposal_data: dict,
        rfp_data: dict,
        review_count_data: dict,
    ) -> Dict[str, Optional[str]]:
        """
        Map a prefetched ResearchhubUnifiedDocument to a Personalize item dictionary.

        Args:
            prefetched_doc: UnifiedDocument with prefetched relations
            bounty_data: Dict with has_active_bounty and has_solutions flags
            proposal_data: Dict with is_open and has_funders flags
            rfp_data: Dict with is_open and has_applicants flags
            review_count_data: Dict mapping doc_id to review count

        Returns:
            Dictionary with keys matching CSV_HEADERS
        """
        # Initialize row with default values from constants
        row = {field: default for field, default in FIELD_DEFAULTS.items()}

        # Get the concrete document from prefetched data (avoids N+1 queries)
        if prefetched_doc.document_type == "PAPER":
            # For papers, use select_related paper (no query)
            document = prefetched_doc.paper
            if not document:
                raise ValueError(f"Paper not found for unified_doc {prefetched_doc.id}")
        else:
            # For posts, get from prefetched posts (no query)
            # Access the prefetch cache directly to avoid posts.first() query
            posts = prefetched_doc.posts.all()
            if not posts:
                raise ValueError(f"Post not found for unified_doc {prefetched_doc.id}")
            document = posts[0]  # Get first from cached list

        # Map common fields
        row.update(self._map_common_fields(prefetched_doc, document))

        # Map document-type-specific fields
        if prefetched_doc.document_type == "PAPER":
            row.update(self._map_paper_fields(prefetched_doc, document))
        else:
            row.update(self._map_post_fields(prefetched_doc, document))

        # Add batch-fetched metrics
        row.update(
            {
                HAS_ACTIVE_BOUNTY: bounty_data.get("has_active_bounty", False),
                BOUNTY_HAS_SOLUTIONS: bounty_data.get("has_solutions", False),
                PROPOSAL_IS_OPEN: proposal_data.get("is_open", False),
                PROPOSAL_HAS_FUNDERS: proposal_data.get("has_funders", False),
                RFP_IS_OPEN: rfp_data.get("is_open", False),
                RFP_HAS_APPLICANTS: rfp_data.get("has_applicants", False),
                PEER_REVIEW_COUNT_TOTAL: review_count_data.get(prefetched_doc.id, 0),
            }
        )

        return row

    def _map_common_fields(
        self, prefetched_doc: PrefetchedUnifiedDocument, document
    ) -> dict:
        """Map fields common to all document types using prefetched data."""
        from hub.models import Hub

        # Timestamp
        if (
            prefetched_doc.document_type == "PAPER"
            and hasattr(document, "paper_publish_date")
            and document.paper_publish_date
        ):
            timestamp = datetime_to_epoch_seconds(document.paper_publish_date)
        else:
            timestamp = datetime_to_epoch_seconds(prefetched_doc.created_date)

        # Hub processing
        from analytics.constants.personalize_constants import MAX_HUB_IDS

        hub_ids = []
        hub_l1 = None
        hub_l2 = None

        for hub in list(prefetched_doc.hubs.all())[:MAX_HUB_IDS]:
            hub_ids.append(str(hub.id))
            if hub.namespace == Hub.Namespace.CATEGORY:
                hub_l1 = str(hub.id)
            elif hub.namespace == Hub.Namespace.SUBCATEGORY:
                hub_l2 = str(hub.id)

        return {
            ITEM_ID: str(prefetched_doc.id),
            ITEM_TYPE: ITEM_TYPE_MAPPING.get(
                prefetched_doc.document_type, prefetched_doc.document_type
            ),
            CREATION_TIMESTAMP: timestamp,
            UPVOTE_SCORE: (
                prefetched_doc.score if prefetched_doc.score is not None else 0
            ),
            HUB_L1: hub_l1,
            HUB_L2: hub_l2,
            HUB_IDS: DELIMITER.join(hub_ids) if hub_ids else None,
        }

    def _map_paper_fields(
        self, prefetched_doc: PrefetchedUnifiedDocument, paper
    ) -> dict:
        """Map paper-specific fields."""
        title = paper.paper_title or paper.title or ""
        abstract = paper.abstract or ""
        # Build hub names from prefetched hubs to avoid query
        hub_names = ",".join(hub.name for hub in prefetched_doc.hubs.all())

        text_concat = f"{title} {abstract} {hub_names}"

        fields = {
            TITLE: prepare_text_for_personalize(title),
            TEXT: prepare_text_for_personalize(text_concat),
            CITATION_COUNT_TOTAL: paper.citations if paper.citations is not None else 0,
        }

        if paper.external_metadata:
            metrics = paper.external_metadata.get("metrics", {})
            fields[BLUESKY_COUNT_TOTAL] = metrics.get("bluesky_count", 0)
            fields[TWEET_COUNT_TOTAL] = metrics.get("twitter_count", 0)

        return fields

    def _map_post_fields(self, prefetched_doc: PrefetchedUnifiedDocument, post) -> dict:
        """Map post-specific fields."""
        title = post.title or ""
        renderable_text = post.renderable_text or ""
        # Build hub names from prefetched hubs to avoid query
        hub_names = ",".join(hub.name for hub in prefetched_doc.hubs.all())

        text_concat = f"{title} {renderable_text} {hub_names}"

        return {
            TITLE: prepare_text_for_personalize(title),
            TEXT: prepare_text_for_personalize(text_concat),
        }
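
Taken together, map_to_item expects the caller to have already prefetched hubs and posts (and select_related the paper) and to have batch-computed the bounty, proposal, RFP, and review-count lookups, so the per-document work is query-free. The sketch below is a hedged illustration of that calling pattern; the build_rows function and the *_by_doc dictionaries are hypothetical placeholders for whatever the export command in this PR actually builds.

def build_rows(unified_docs, bounty_by_doc, proposal_by_doc, rfp_by_doc, review_counts):
    # Hypothetical driver, not from this commit. `unified_docs` is assumed to be
    # a queryset prepared with select_related("paper") and
    # prefetch_related("hubs", "posts"); the *_by_doc dicts map doc id -> flags.
    mapper = PersonalizeItemMapper()
    rows = []
    for doc in unified_docs:
        try:
            rows.append(
                mapper.map_to_item(
                    prefetched_doc=doc,
                    bounty_data=bounty_by_doc.get(doc.id, {}),
                    proposal_data=proposal_by_doc.get(doc.id, {}),
                    rfp_data=rfp_by_doc.get(doc.id, {}),
                    review_count_data=review_counts,
                )
            )
        except ValueError:
            # Documents without a concrete paper/post are skipped
            continue
    return rows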
