Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions run-task.sh
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,16 @@ elif [ "$SERVICE" = "filesystem" ]; then
$([ -f .mcp_env ] && echo "-v $(pwd)/.mcp_env:/app/.mcp_env:ro") \
"$DOCKER_IMAGE" \
python3 -m pipeline --mcp "$SERVICE" --k 1 "$@"
elif [ "$SERVICE" = "insforge" ]; then
# For Insforge service, map host.docker.internal to the host gateway so the container can reach the Insforge backend running on the host
docker run --rm \
--memory="$DOCKER_MEMORY_LIMIT" \
--cpus="$DOCKER_CPU_LIMIT" \
--add-host=host.docker.internal:host-gateway \
-v "$(pwd)/results:/app/results" \
$([ -f .mcp_env ] && echo "-v $(pwd)/.mcp_env:/app/.mcp_env:ro") \
"$DOCKER_IMAGE" \
python3 -m pipeline --mcp "$SERVICE" --k 1 "$@"
else
# For other services (notion, github, playwright, etc.)
docker run --rm \
Expand Down
18 changes: 16 additions & 2 deletions src/agents/base_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
class BaseMCPAgent(ABC):
"""Base class with shared functionality for MCPMark agents."""

STDIO_SERVICES = ["notion", "filesystem", "playwright", "playwright_webarena", "postgres"]
HTTP_SERVICES = ["github"]
STDIO_SERVICES = ["notion", "filesystem", "playwright", "playwright_webarena", "postgres", "insforge"]
HTTP_SERVICES = ["github", "supabase"]
DEFAULT_TIMEOUT = 600

CLAUDE_THINKING_BUDGETS = {
Expand Down Expand Up @@ -207,6 +207,20 @@ def _create_stdio_server(self) -> MCPStdioServer:
env={"DATABASE_URI": database_url},
)

if self.mcp_service == "insforge":
api_key = self.service_config.get("api_key")
backend_url = self.service_config.get("backend_url")
if not all([api_key, backend_url]):
raise ValueError("Insforge requires api_key and backend_url")
return MCPStdioServer(
command="npx",
args=["-y", "@insforge/mcp@dev"],
env={
"INSFORGE_API_KEY": api_key,
"INSFORGE_BACKEND_URL": backend_url,
},
)

raise ValueError(f"Unsupported stdio service: {self.mcp_service}")

def _create_http_server(self) -> MCPHttpServer:
Expand Down
44 changes: 39 additions & 5 deletions src/agents/mcpmark_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -844,18 +844,32 @@ def _create_stdio_server(self) -> MCPStdioServer:
username = self.service_config.get("username")
password = self.service_config.get("password")
database = self.service_config.get("current_database") or self.service_config.get("database")

if not all([username, password, database]):
raise ValueError("PostgreSQL requires username, password, and database")

database_url = f"postgresql://{username}:{password}@{host}:{port}/{database}"

return MCPStdioServer(
command="pipx",
args=["run", "postgres-mcp", "--access-mode=unrestricted"],
env={"DATABASE_URI": database_url}
)


elif self.mcp_service == "insforge":
api_key = self.service_config.get("api_key")
backend_url = self.service_config.get("backend_url")
if not all([api_key, backend_url]):
raise ValueError("Insforge requires api_key and backend_url")
return MCPStdioServer(
command="npx",
args=["-y", "@insforge/mcp@dev"],
env={
"INSFORGE_API_KEY": api_key,
"INSFORGE_BACKEND_URL": backend_url,
},
)

else:
raise ValueError(f"Unsupported stdio service: {self.mcp_service}")

Expand All @@ -866,14 +880,34 @@ def _create_http_server(self) -> MCPHttpServer:
github_token = self.service_config.get("github_token")
if not github_token:
raise ValueError("GitHub token required")

return MCPHttpServer(
url="https://api.githubcopilot.com/mcp/",
headers={
"Authorization": f"Bearer {github_token}",
"User-Agent": "MCPMark/1.0"
}
)

elif self.mcp_service == "supabase":
# Use built-in MCP server from Supabase CLI
api_url = self.service_config.get("api_url", "http://localhost:54321")
api_key = self.service_config.get("api_key", "")

if not api_key:
raise ValueError("Supabase requires api_key (use secret key from 'supabase status')")

# Supabase CLI exposes MCP at /mcp endpoint
mcp_url = f"{api_url}/mcp"

return MCPHttpServer(
url=mcp_url,
headers={
"apikey": api_key,
"Authorization": f"Bearer {api_key}",
}
)

else:
raise ValueError(f"Unsupported HTTP service: {self.mcp_service}")

22 changes: 12 additions & 10 deletions src/aggregators/aggregate_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,19 @@
def discover_tasks() -> Dict[str, List[str]]:
"""Discover all tasks from ./tasks directory."""
tasks_dir = Path("./tasks")

all_tasks = {}

# Handle each MCP service
# Note: playwright and playwright_webarena both map to "playwright" MCP
service_mappings = {
"filesystem": ["filesystem"],
"github": ["github"],
"notion": ["notion"],
"playwright": ["playwright", "playwright_webarena"], # Both count as playwright
"postgres": ["postgres"]
"postgres": ["postgres", "supabase", "insforge"] # All map to postgres
}

for mcp_service, task_dirs in service_mappings.items():
tasks = []
for task_dir_name in task_dirs:
Expand Down Expand Up @@ -68,9 +68,11 @@ def collect_results(exp_dir: Path, k: int) -> Dict[str, Dict[str, Any]]:
continue

model, service = model_service_dir.name.split("__", 1)
# Normalize service name: treat playwright_webarena as playwright
# Normalize service names
if service == "playwright_webarena":
service = "playwright"
elif service in ["supabase", "insforge"]:
service = "postgres"

for run_idx in range(1, k + 1):
run_dir = model_service_dir / f"run-{run_idx}"
Expand Down Expand Up @@ -874,23 +876,23 @@ def main():
help="Comma-separated list of models that only need run-1"
)
parser.add_argument("--push", action="store_true", help="Push to GitHub (default to main)")

args = parser.parse_args()

# Parse single-run models
single_run_models = []
if args.single_run_models:
single_run_models = [m.strip() for m in args.single_run_models.split(",")]
print(f"📌 Single-run models: {', '.join(single_run_models)}")

# Setup paths
exp_dir = Path("./results") / args.exp_name
if not exp_dir.exists():
print(f"❌ Experiment directory {exp_dir} does not exist")
return 1

print(f"🔄 Processing experiment: {args.exp_name}")

# Discover all tasks
print("📋 Discovering tasks...")
all_tasks = discover_tasks()
Expand Down
Loading