webrecorder · ikreymer · Mar 14, 2025 · Mar 14, 2025 · May 7, 2025 · May 7, 2025
diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py
@@ -705,6 +705,8 @@ async def update_collection_counts_and_tags(self, collection_id: UUID):
 
         unique_page_count = await self.page_ops.get_unique_page_count(crawl_ids)
 
+        top_page_hosts = await self.page_ops.get_top_page_hosts(crawl_ids)
+
         await self.collections.find_one_and_update(
             {"_id": collection_id},
             {
@@ -715,6 +717,7 @@ async def update_collection_counts_and_tags(self, collection_id: UUID):
                     "totalSize": total_size,
                     "tags": sorted_tags,
                     "preloadResources": preload_resources,
+                    "topPageHosts": top_page_hosts,
                 }
             },
         )

diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py
@@ -32,7 +32,7 @@
     ) = PageOps = BackgroundJobOps = object
 
 
-CURR_DB_VERSION = "0043"
+CURR_DB_VERSION = "0044"
 
 
 # ============================================================================

diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
@@ -1417,6 +1417,14 @@ class PreloadResource(BaseModel):
     crawlId: str
 
 
+# ============================================================================
+class HostCount(BaseModel):
+    """Page count for a single host, as listed in a collection's topPageHosts"""
+
+    host: str  # host captured from the page URLs being counted
+    count: int  # number of pages with that host
+
+
 # ============================================================================
 class Collection(BaseMongoModel):
     """Org collection structure"""
@@ -1515,6 +1523,8 @@ class CollOut(BaseMongoModel):
     pagesQueryUrl: str = ""
     downloadUrl: Optional[str] = None
 
+    topPageHosts: List[HostCount] = []
+
 
 # ============================================================================
 class PublicCollOut(BaseMongoModel):
@@ -1550,6 +1560,8 @@ class PublicCollOut(BaseMongoModel):
 
     allowPublicDownload: bool = True
 
+    topPageHosts: List[HostCount] = []
+
 
 # ============================================================================
 class UpdateColl(BaseModel):

diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
@@ -923,6 +923,35 @@ async def get_unique_page_count(self, crawl_ids: List[str]) -> int:
         res = await cursor.to_list(1)
         return res[0].get("urls") if res else 0
 
+    async def get_top_page_hosts(
+        self, crawl_ids: List[str]
+    ) -> List[dict[str, str | int]]:
+        """Get the 10 most common page hosts (with page counts) across crawls
+
+        Pages whose url is not an http(s) url are skipped, so every returned
+        "host" is a string. Ties in count are ordered by host name.
+        """
+        host_match = {"$regexFind": {"input": "$url", "regex": "^https?://([^/]+)"}}
+        cursor = self.pages.aggregate(
+            [
+                {"$match": {"crawl_id": {"$in": crawl_ids}}},
+                {"$addFields": {"host": host_match}},
+                # $regexFind yields null when url has no http(s) host; drop those
+                {"$match": {"host": {"$ne": None}}},
+                {
+                    "$group": {
+                        "_id": {"$first": "$host.captures"},
+                        "count": {"$count": {}},
+                    }
+                },
+                # secondary sort on _id keeps order stable for equal counts
+                {"$sort": {"count": -1, "_id": 1}},
+                {"$limit": 10},
+            ]
+        )
+        res = await cursor.to_list(10)
+        return [{"host": x.get("_id"), "count": x.get("count")} for x in res]
+
     async def set_archived_item_page_counts(self, crawl_id: str):
         """Store archived item page and unique page counts in crawl document"""
         page_count = await self.pages.count_documents({"crawl_id": crawl_id})

diff --git a/frontend/src/layouts/collections/metadataColumn.ts b/frontend/src/layouts/collections/metadataColumn.ts
@@ -56,6 +56,20 @@ export function metadataColumn(collection?: Collection | PublicCollection) {
         label: metadata.totalSize,
         render: (col) => `${localize.bytes(col.totalSize)}`,
       })}
+      ${metadataItem({
+        label: metadata.topPageHosts,
+        render: (col) =>
+          html` <table>
+            ${col.topPageHosts.map(
+              (x) => html`
+                <tr>
+                  <td>${x.host}</td>
+                  <td class="pl-4">${x.count}</td>
+                </tr>
+              `,
+            )}
+          </table>`,
+      })}
     </btrix-desc-list>
   `;
 }
diff --git a/frontend/src/strings/collections/metadata.ts b/frontend/src/strings/collections/metadata.ts
@@ -5,4 +5,5 @@ export const metadata = {
   uniquePageCount: msg("Unique Pages in Collection"),
   pageCount: msg("Total Pages Crawled"),
   totalSize: msg("Collection Size"),
+  topPageHosts: msg("Top Page Hostnames"),
 };
diff --git a/frontend/src/types/collection.ts b/frontend/src/types/collection.ts
@@ -41,6 +41,12 @@ export const publicCollectionSchema = z.object({
   crawlCount: z.number(),
   uniquePageCount: z.number(),
   pageCount: z.number(),
+  topPageHosts: z.array(
+    z.object({
+      host: z.string(),
+      count: z.number(),
+    }),
+  ),
   totalSize: z.number(),
   allowPublicDownload: z.boolean(),
   homeUrl: z.string().url().nullable(),