apify · axmanalad · Jul 24, 2025 · Jul 24, 2025 · Jul 24, 2025 · Jul 24, 2025
diff --git a/docs/introduction/03-adding-urls.mdx b/docs/introduction/03-adding-urls.mdx
@@ -114,6 +114,43 @@ await enqueueLinks({
 
 When you run the code, you will see the crawler log the **title** of the first page, then the **enqueueing** message showing the number of URLs, followed by the **title** of the first enqueued page, and so on.
 
+### Controlling subdomain access
+
+When using the `same-domain` strategy, you can further control which subdomains are allowed using the `allowedSubdomains` option. By default with `same-domain`, all subdomains are included, but you can specify only certain subdomains:
+
+```ts
+// Allow only specific subdomains with same-domain strategy
+await enqueueLinks({
+    strategy: 'same-domain',
+    allowedSubdomains: ['blog', 'shop', 'support']
+});
+
+// Allow all subdomains (default same-domain behavior)
+await enqueueLinks({
+    strategy: 'same-domain',
+    allowedSubdomains: ['*']
+});
+
+// Allow only the main domain and the base URL's own hostname (no other subdomains) with same-domain strategy
+await enqueueLinks({
+    strategy: 'same-domain',
+    allowedSubdomains: ['']
+});
+```
+
+For example, if your base URL is `https://example.com` and you use `strategy: 'same-domain'` with `allowedSubdomains: ['blog', 'shop']`, the crawler will enqueue links that point to:
+- `https://example.com` (main domain)
+- `https://blog.example.com` (allowed subdomain)
+- `https://shop.example.com` (allowed subdomain)
+
+But it will **not** enqueue links that point to:
+- `https://api.example.com` (subdomain not in allowed list)
+- `https://docs.example.com` (subdomain not in allowed list)
+
+:::note
+The `allowedSubdomains` option only works when you explicitly set `strategy: 'same-domain'`. It has no effect with the default `same-hostname` strategy or other strategies.
+:::
+
 ## Skipping duplicate URLs
 
 Skipping of duplicate URLs is critical, because visiting the same page multiple times would lead to duplicate results. This is automatically handled by the `RequestQueue` which deduplicates requests using their `uniqueKey`. This `uniqueKey` is automatically generated from the request's URL by lowercasing the URL, lexically ordering query parameters, removing fragments and a few other tweaks that ensure the queue only includes unique URLs.
@@ -144,6 +181,16 @@ await enqueueLinks({
 });
 ```
 
+You can also combine the `same-domain` strategy with `allowedSubdomains` for precise control:
+
+```ts
+await enqueueLinks({
+    strategy: 'same-domain',
+    allowedSubdomains: ['www', 'mobile'], // Only allow www and mobile subdomains
+    globs: ['http?(s)://**.example.com/products/*'] // Additional URL filtering
+});
+```
+
 ### Transform requests
 
 To have absolute control, we have the <ApiLink to="core/interface/EnqueueLinksOptions/#transformRequestFunction">`transformRequestFunction`</ApiLink>. Just before a new <ApiLink to="core/class/Request">`Request`</ApiLink> is constructed and enqueued to the <ApiLink to="core/class/RequestQueue">`RequestQueue`</ApiLink>, this function can be used to skip it or modify its contents such as `userData`, `payload` or, most importantly, `uniqueKey`. This is useful when you need to enqueue multiple requests to the queue, and these requests share the same URL, but differ in methods or payloads. Another use case is to dynamically update or create the `userData`.

diff --git a/packages/core/src/enqueue_links/enqueue_links.ts b/packages/core/src/enqueue_links/enqueue_links.ts
@@ -1,7 +1,7 @@
 import type { BatchAddRequestsResult, Dictionary } from '@crawlee/types';
 import { type RobotsTxtFile } from '@crawlee/utils';
 import ow from 'ow';
-import { getDomain } from 'tldts';
+import { getDomain, getSubdomain } from 'tldts';
 import type { SetRequired } from 'type-fest';
 
 import log from '@apify/log';
@@ -67,6 +67,14 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
      */
     baseUrl?: string;
 
+    /**
+     * An array of allowed subdomain names (e.g. `['blog', 'shop']`) to be used for matching URLs. Including `'*'`, or passing an empty array, allows every subdomain.
+     *
+     * Note that this option is only used when the `strategy` is set to `same-domain`.
+     * @default ["*"]
+     */
+    allowedSubdomains?: readonly string[];
+
     /**
      * An array of glob pattern strings or plain objects
      * containing glob pattern strings matching the URLs to be enqueued.
@@ -156,8 +164,8 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
      * Depending on the strategy you select, we will only check certain parts of the URLs found. Here is a diagram of each URL part and their name:
      *
      * ```md
-     * Protocol          Domain
-     * ┌────┐          ┌─────────┐
+     * Protocol  Sub     Domain
+     * ┌────┐  ┌─────┐ ┌─────────┐
      * https://example.crawlee.dev/...
      * │       └─────────────────┤
      * │             Hostname    │
@@ -198,8 +206,8 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
  * Depending on the strategy you select, we will only check certain parts of the URLs found. Here is a diagram of each URL part and their name:
  *
  * ```md
- * Protocol          Domain
- * ┌────┐          ┌─────────┐
+ * Protocol  Sub     Domain
+ * ┌────┐  ┌─────┐ ┌─────────┐
  * https://example.crawlee.dev/...
  * │       └─────────────────┤
  * │             Hostname    │
@@ -233,6 +241,11 @@ export enum EnqueueStrategy {
      * For example, `https://wow.an.example.com` and `https://example.com` will both be matched for a base url of
      * `https://example.com`.
      *
+     * If `allowedSubdomains` is provided, only the listed subdomains (and the base domain itself) are matched instead of every subdomain.
+     * For example, if `allowedSubdomains` is set to `['wow', 'nice']` and the base URL is `https://example.com`, then
+     * `https://wow.example.com`, `https://nice.example.com`, and `https://example.com` will be matched, but
+     * `https://bar.example.com` will not.
+     *
      * > This strategy will match both `http` and `https` protocols regardless of the base URL protocol.
      */
     SameDomain = 'same-domain',
@@ -313,6 +326,7 @@ export async function enqueueLinks(
             transformRequestFunction: ow.optional.function,
             strategy: ow.optional.string.oneOf(Object.values(EnqueueStrategy)),
             waitForAllRequestsToBeAdded: ow.optional.boolean,
+            allowedSubdomains: ow.optional.array.ofType(ow.string),
         }),
     );
 
@@ -329,6 +343,7 @@ export async function enqueueLinks(
         waitForAllRequestsToBeAdded,
         robotsTxtFile,
         onSkippedRequest,
+        allowedSubdomains,
     } = options;
 
     const urlExcludePatternObjects: UrlPatternObject[] = [];
@@ -374,18 +389,39 @@ export async function enqueueLinks(
                 enqueueStrategyPatterns.push({ glob: ignoreHttpSchema(`${url.origin}/**`) });
                 break;
             case EnqueueStrategy.SameDomain: {
-                // Get the actual hostname from the base url
-                const baseUrlHostname = getDomain(url.hostname, { mixedInputs: false });
-
-                if (baseUrlHostname) {
-                    // We have a hostname, so we can use it to match all links on the page that point to it and any subdomains of it
-                    url.hostname = baseUrlHostname;
-                    enqueueStrategyPatterns.push(
-                        { glob: ignoreHttpSchema(`${url.origin.replace(baseUrlHostname, `*.${baseUrlHostname}`)}/**`) },
-                        { glob: ignoreHttpSchema(`${url.origin}/**`) },
-                    );
+                // Get the actual domain and subdomain from the base url
+                const baseUrlDomain = getDomain(url.hostname, { mixedInputs: false });
+                const baseUrlSubdomain = getSubdomain(url.hostname);
+                const subList = allowedSubdomains ?? ['*'];
+
+                if (baseUrlDomain) {
+                    // We have a domain, so we can use it to match all links on the page that point to it and any subdomains of it
+                    if (subList.includes('*') || subList.length === 0) {
+                        url.hostname = baseUrlDomain;
+                        enqueueStrategyPatterns.push(
+                            { glob: ignoreHttpSchema(`${url.origin.replace(baseUrlDomain, `*.${baseUrlDomain}`)}/**`) },
+                            { glob: ignoreHttpSchema(`${url.origin}/**`) }, // Only base URL, no subdomains
+                        );
+                    } else {
+                        // Always allow the original URL's own hostname, including its subdomain if it has one.
+                        enqueueStrategyPatterns.push({ glob: ignoreHttpSchema(`${url.origin}/**`) }); // Original URL
+                        // User decides to filter by specific subdomains, so we can match them.
+                        for (const subdomain of subList) {
+                            if (subdomain && subdomain !== baseUrlSubdomain) {
+                                const filteredSubdomainUrl = new URL(url.origin);
+                                filteredSubdomainUrl.hostname = `${subdomain}.${baseUrlDomain}`;
+                                enqueueStrategyPatterns.push({
+                                    glob: ignoreHttpSchema(`${filteredSubdomainUrl.origin}/**`),
+                                });
+                            }
+                        }
+                        url.hostname = baseUrlDomain;
+                        enqueueStrategyPatterns.push(
+                            { glob: ignoreHttpSchema(`${url.origin}/**`) }, // Only base URL, no subdomains
+                        );
+                    }
                 } else {
-                    // We don't have a hostname (can happen for ips for instance), so reproduce the same behavior
+                    // We don't have a domain (can happen for ips for instance), so reproduce the same behavior
                     // as SameDomainAndSubdomain
                     enqueueStrategyPatterns.push({ glob: ignoreHttpSchema(`${url.origin}/**`) });
                 }