using AngleSharp.Dom;
using AngleSharp.Html.Parser;
using System.Net;
using System.Text.Json;

namespace Sonex.Worker.WebSync;

/// <summary>
/// Downloads the pages of a paginated web collection and extracts the article numbers
/// and absolute product URLs found on them.
/// </summary>
// NOTE(review): the generic type arguments in this file (HashSet<string>, Task<T>,
// Func<CancellationToken, Task>) were reconstructed from how the values are used.
// Confirm the waitIfPaused delegate type against the callers.
internal sealed class WebSyncCollectionLoader
{
    // The per-request timeout is enforced with a linked CancellationTokenSource in
    // DownloadPageHtmlWithRetryAsync. HttpClient's own default timeout (100 seconds)
    // is disabled so that it cannot silently cap a larger configured download timeout.
    private static readonly HttpClient CollectionHttpClient = new()
    {
        Timeout = System.Threading.Timeout.InfiniteTimeSpan,
    };

    // Characters that start the query or fragment part of a URL.
    private static readonly char[] QueryOrFragmentDelimiters = { '?', '#' };

    // Value sent as the "page_size" query parameter; a page with fewer product URLs
    // than this is treated as the last page.
    private const int PageSize = 96;

    // Hard upper bound on the number of pages requested for one collection.
    private const int MaxPages = 500;

    private const string OperationCollectionParsing = "CollectionParsing";

    /// <summary>
    /// Walks the collection page by page (starting at page 1) and accumulates every
    /// article number and product URL found.
    /// </summary>
    /// <param name="baseCollectionUrl">Collection URL; any query string or fragment is dropped before paging parameters are appended.</param>
    /// <param name="retryCount">Number of additional download attempts per page after the first one.</param>
    /// <param name="downloadTimeoutSeconds">Timeout applied to each individual download attempt.</param>
    /// <param name="retryDelaySeconds">Delay between attempts; negative values are treated as zero.</param>
    /// <param name="collectionOperation">Operation name attached to <see cref="WebSyncOperationException"/> when a download fails.</param>
    /// <param name="runReport">Report whose page-downloaded counter is incremented after each successful download.</param>
    /// <param name="waitIfPaused">Callback awaited before each page, each download attempt and each parsed tile (presumably blocks while the run is paused).</param>
    /// <param name="cancellationToken">Token that aborts the scan.</param>
    /// <returns>The de-duplicated (case-insensitive) article numbers and product URLs.</returns>
    /// <exception cref="WebSyncOperationException">A page could not be downloaded after all attempts, or could not be parsed.</exception>
    /// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was cancelled.</exception>
    public async Task<CollectionScanResult> DownloadCollectionAsync(
        string baseCollectionUrl,
        int retryCount,
        int downloadTimeoutSeconds,
        int retryDelaySeconds,
        string collectionOperation,
        WebSyncRunReport runReport,
        Func<CancellationToken, Task> waitIfPaused,
        CancellationToken cancellationToken)
    {
        var articleNumbers = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
        var productUrls = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
        string normalizedBaseUrl = NormalizeBaseCollectionUrl(baseCollectionUrl);

        for (int page = 1; page <= MaxPages; page++)
        {
            cancellationToken.ThrowIfCancellationRequested();
            await waitIfPaused(cancellationToken).ConfigureAwait(false);

            string pageUrl = BuildPageUrl(normalizedBaseUrl, page);
            string html = await DownloadPageHtmlWithRetryAsync(
                pageUrl,
                retryCount,
                downloadTimeoutSeconds,
                retryDelaySeconds,
                collectionOperation,
                runReport,
                waitIfPaused,
                cancellationToken).ConfigureAwait(false);

            CollectionPageItems pageItems = await ExtractCollectionPageItemsAsync(
                normalizedBaseUrl,
                html,
                waitIfPaused,
                cancellationToken).ConfigureAwait(false);

            // An empty page means we ran past the end of the collection.
            if (pageItems.ArticleNumbers.Count == 0 && pageItems.ProductUrls.Count == 0)
            {
                break;
            }

            int knownBefore = articleNumbers.Count + productUrls.Count;
            articleNumbers.UnionWith(pageItems.ArticleNumbers);
            productUrls.UnionWith(pageItems.ProductUrls);

            // A short page is the last page.
            if (pageItems.ProductUrls.Count < PageSize)
            {
                break;
            }

            // A full page that contributed nothing new means the server is repeating
            // content for out-of-range page numbers; requesting more pages would only
            // re-download the same items until MaxPages is reached.
            if (articleNumbers.Count + productUrls.Count == knownBefore)
            {
                break;
            }
        }

        return new CollectionScanResult
        {
            ArticleNumbers = articleNumbers,
            ProductUrls = productUrls,
        };
    }

    /// <summary>
    /// Downloads one page as a string, retrying failed or timed-out attempts.
    /// </summary>
    /// <returns>The response body of the first successful attempt.</returns>
    /// <exception cref="WebSyncOperationException">Every attempt failed; the last failure is the inner exception.</exception>
    /// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was cancelled.</exception>
    private static async Task<string> DownloadPageHtmlWithRetryAsync(
        string pageUrl,
        int retryCount,
        int downloadTimeoutSeconds,
        int retryDelaySeconds,
        string collectionOperation,
        WebSyncRunReport runReport,
        Func<CancellationToken, Task> waitIfPaused,
        CancellationToken cancellationToken)
    {
        int attempts = Math.Max(1, retryCount + 1);

        // Task.Delay rejects negative delays; treat them as "retry immediately".
        TimeSpan retryDelay = TimeSpan.FromSeconds(Math.Max(0, retryDelaySeconds));

        for (int attempt = 1; attempt <= attempts; attempt++)
        {
            cancellationToken.ThrowIfCancellationRequested();
            await waitIfPaused(cancellationToken).ConfigureAwait(false);

            try
            {
                // Linked source: cancelled either by the caller or by the per-attempt timeout.
                using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
                timeoutCts.CancelAfter(TimeSpan.FromSeconds(downloadTimeoutSeconds));

                using HttpResponseMessage response = await CollectionHttpClient.GetAsync(
                    pageUrl,
                    HttpCompletionOption.ResponseHeadersRead,
                    timeoutCts.Token).ConfigureAwait(false);
                response.EnsureSuccessStatusCode();

                string html = await response.Content.ReadAsStringAsync(timeoutCts.Token).ConfigureAwait(false);
                runReport.IncrementPageDownloaded();
                return html;
            }
            catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
            {
                // Caller-requested cancellation is not a download failure: never retry it
                // and never wrap it in WebSyncOperationException.
                throw;
            }
            catch (Exception) when (attempt < attempts)
            {
                // Timeouts and all other failures are retried while attempts remain.
                await waitIfPaused(cancellationToken).ConfigureAwait(false);
                await Task.Delay(retryDelay, cancellationToken).ConfigureAwait(false);
            }
            catch (Exception ex)
            {
                string message = $"Collection page download failed for {pageUrl} after {attempts} attempts.";
                throw new WebSyncOperationException(collectionOperation, message, ex);
            }
        }

        // Not reachable in practice (the last attempt either returns or throws);
        // kept so that every code path ends in a return or throw.
        throw new WebSyncOperationException(
            collectionOperation,
            $"Collection page download failed for {pageUrl}.",
            new InvalidOperationException("Unknown collection page download failure."));
    }

    /// <summary>
    /// Parses a collection page and collects product URLs and article numbers from the
    /// <c>a.product__tile</c> links and from embedded catalog data.
    /// </summary>
    /// <exception cref="WebSyncOperationException">Parsing failed for any reason other than cancellation.</exception>
    private static async Task<CollectionPageItems> ExtractCollectionPageItemsAsync(
        string baseCollectionUrl,
        string html,
        Func<CancellationToken, Task> waitIfPaused,
        CancellationToken cancellationToken)
    {
        try
        {
            var parser = new HtmlParser();
            IDocument document = await parser.ParseDocumentAsync(html, cancellationToken).ConfigureAwait(false);

            var articleNumbers = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
            var productUrls = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

            foreach (IElement productTile in document.QuerySelectorAll("a.product__tile"))
            {
                cancellationToken.ThrowIfCancellationRequested();
                await waitIfPaused(cancellationToken).ConfigureAwait(false);

                string? href = productTile.GetAttribute("href");
                if (TryExtractProductUrl(baseCollectionUrl, href, out string productUrl))
                {
                    productUrls.Add(productUrl);
                }

                // Query the image once. Prefer data-src, but fall back to src when
                // data-src is missing or blank (an empty attribute is not null).
                IElement? image = productTile.QuerySelector("img[data-src], img[src]");
                string? imageUrl = image?.GetAttribute("data-src");
                if (string.IsNullOrWhiteSpace(imageUrl))
                {
                    imageUrl = image?.GetAttribute("src");
                }

                // The image path is the preferred source; the link slug is the fallback.
                if (TryExtractArticleNumberFromImageUrl(baseCollectionUrl, imageUrl, out string fromImage))
                {
                    articleNumbers.Add(fromImage);
                }
                else if (TryExtractArticleNumberFromHref(baseCollectionUrl, href, out string fromHref))
                {
                    articleNumbers.Add(fromHref);
                }
            }

            ExtractFromCatalogData(document, baseCollectionUrl, articleNumbers, productUrls);

            return new CollectionPageItems
            {
                ArticleNumbers = articleNumbers,
                ProductUrls = productUrls,
            };
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            throw new WebSyncOperationException(
                OperationCollectionParsing,
                "Collection page parsing failed.",
                ex);
        }
    }

    /// <summary>
    /// Reads an article number from an image URL. The URL (resolved against the base
    /// URL) must have at least five path segments, the first two being "pub" and "cdn";
    /// the third segment is returned as the article number.
    /// </summary>
    private static bool TryExtractArticleNumberFromImageUrl(
        string baseCollectionUrl,
        string? rawUrl,
        out string articleNumber)
    {
        articleNumber = string.Empty;

        if (string.IsNullOrWhiteSpace(rawUrl))
        {
            return false;
        }

        if (!Uri.TryCreate(baseCollectionUrl, UriKind.Absolute, out Uri? baseUri))
        {
            return false;
        }

        // The value may come from HTML attributes or JSON, so undo HTML entities
        // and JSON-escaped slashes before resolving it.
        string decoded = WebUtility.HtmlDecode(rawUrl.Trim())
            .Replace("\\/", "/", StringComparison.Ordinal);

        if (!Uri.TryCreate(baseUri, decoded, out Uri? imageUri))
        {
            return false;
        }

        string[] segments = imageUri.AbsolutePath
            .Split('/', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);

        if (segments.Length < 5)
        {
            return false;
        }

        bool hasExpectedPrefix =
            string.Equals(segments[0], "pub", StringComparison.OrdinalIgnoreCase)
            && string.Equals(segments[1], "cdn", StringComparison.OrdinalIgnoreCase);
        if (!hasExpectedPrefix)
        {
            return false;
        }

        string candidate = segments[2];
        if (string.IsNullOrWhiteSpace(candidate))
        {
            return false;
        }

        articleNumber = candidate.Trim();
        return articleNumber.Length > 0;
    }

    /// <summary>
    /// Reads an article number from a product link: the all-digit text after the last
    /// '-' of the final path segment, left-padded with zeros to six characters.
    /// </summary>
    private static bool TryExtractArticleNumberFromHref(
        string baseCollectionUrl,
        string? href,
        out string articleNumber)
    {
        articleNumber = string.Empty;

        if (string.IsNullOrWhiteSpace(href))
        {
            return false;
        }

        if (!Uri.TryCreate(baseCollectionUrl, UriKind.Absolute, out Uri? baseUri))
        {
            return false;
        }

        if (!Uri.TryCreate(baseUri, href.Trim(), out Uri? productUri))
        {
            return false;
        }

        string[] segments = productUri.AbsolutePath
            .Split('/', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);

        if (segments.Length == 0)
        {
            return false;
        }

        string slug = segments[^1];
        int lastDashIndex = slug.LastIndexOf('-');

        // Require a dash that is followed by at least one character.
        if (lastDashIndex < 0 || lastDashIndex >= slug.Length - 1)
        {
            return false;
        }

        // Same digit check and zero-padding as for SKU values.
        return TryNormalizeArticleNumber(slug[(lastDashIndex + 1)..], out articleNumber);
    }

    /// <summary>
    /// Resolves <paramref name="href"/> against the base URL and returns it as an
    /// absolute URI string when the result uses the http or https scheme.
    /// </summary>
    private static bool TryExtractProductUrl(
        string baseCollectionUrl,
        string? href,
        out string productUrl)
    {
        productUrl = string.Empty;

        if (string.IsNullOrWhiteSpace(href))
        {
            return false;
        }

        if (!Uri.TryCreate(baseCollectionUrl, UriKind.Absolute, out Uri? baseUri))
        {
            return false;
        }

        if (!Uri.TryCreate(baseUri, href.Trim(), out Uri? productUri))
        {
            return false;
        }

        bool isWebScheme =
            string.Equals(productUri.Scheme, Uri.UriSchemeHttp, StringComparison.OrdinalIgnoreCase)
            || string.Equals(productUri.Scheme, Uri.UriSchemeHttps, StringComparison.OrdinalIgnoreCase);
        if (!isWebScheme)
        {
            return false;
        }

        productUrl = productUri.AbsoluteUri;
        return true;
    }

    /// <summary>
    /// Collects product URLs and article numbers from the JSON stored in the
    /// catalog-data attribute of every <c>catalog-page-builder-view</c> element.
    /// Extraction is best effort: elements whose attribute is missing, is not valid
    /// JSON, or does not have the expected shape are skipped.
    /// </summary>
    private static void ExtractFromCatalogData(
        IDocument document,
        string baseCollectionUrl,
        HashSet<string> articleNumbers,
        HashSet<string> productUrls)
    {
        foreach (IElement element in document.QuerySelectorAll("catalog-page-builder-view"))
        {
            string? rawCatalogData = element.GetAttribute(":catalog-data")
                ?? element.GetAttribute("catalog-data")
                ?? element.GetAttribute("v-bind:catalog-data");

            if (string.IsNullOrWhiteSpace(rawCatalogData))
            {
                continue;
            }

            string decoded = WebUtility.HtmlDecode(rawCatalogData.Trim());
            if (decoded.Length == 0)
            {
                continue;
            }

            try
            {
                using JsonDocument json = JsonDocument.Parse(decoded);
                JsonElement root = json.RootElement;

                // TryGetProperty throws on non-object values, so check the kind first;
                // an unexpected shape is skipped just like malformed JSON.
                if (root.ValueKind != JsonValueKind.Object
                    || !root.TryGetProperty("items", out JsonElement items)
                    || items.ValueKind != JsonValueKind.Array)
                {
                    continue;
                }

                foreach (JsonElement item in items.EnumerateArray())
                {
                    if (item.ValueKind != JsonValueKind.Object)
                    {
                        continue;
                    }

                    AddCatalogItem(item, baseCollectionUrl, articleNumbers, productUrls);
                }
            }
            catch (JsonException)
            {
                // Deliberate best effort: a broken catalog payload must not fail the page.
                continue;
            }
        }
    }

    // Adds the product URL and article number of a single catalog item (a JSON object).
    private static void AddCatalogItem(
        JsonElement item,
        string baseCollectionUrl,
        HashSet<string> articleNumbers,
        HashSet<string> productUrls)
    {
        // Product URL: "url" first, "url_key" as fallback.
        string? itemUrl = ReadString(item, "url");
        if (TryExtractProductUrl(baseCollectionUrl, itemUrl, out string productUrl))
        {
            productUrls.Add(productUrl);
        }
        else
        {
            string? urlKey = ReadString(item, "url_key");
            if (TryExtractProductUrl(baseCollectionUrl, urlKey, out string productUrlFromUrlKey))
            {
                productUrls.Add(productUrlFromUrlKey);
            }
        }

        // Article number: the first source that yields a value wins.
        string? placeholderUrl = ReadNestedString(item, "default_image", "placeholder_url");
        if (TryExtractArticleNumberFromImageUrl(baseCollectionUrl, placeholderUrl, out string fromPlaceholder))
        {
            articleNumbers.Add(fromPlaceholder);
            return;
        }

        string? sku = ReadString(item, "sku");
        if (TryNormalizeArticleNumber(sku, out string fromSku))
        {
            articleNumbers.Add(fromSku);
            return;
        }

        if (TryExtractArticleNumberFromHref(baseCollectionUrl, itemUrl, out string fromUrl))
        {
            articleNumbers.Add(fromUrl);
        }
    }

    /// <summary>
    /// Returns the string value of a property, or <c>null</c> when the source is not
    /// an object, the property is missing, or its value is not a string.
    /// </summary>
    private static string? ReadString(JsonElement source, string propertyName)
    {
        if (source.ValueKind != JsonValueKind.Object
            || !source.TryGetProperty(propertyName, out JsonElement value))
        {
            return null;
        }

        return value.ValueKind == JsonValueKind.String ? value.GetString() : null;
    }

    /// <summary>
    /// Returns the string value of <paramref name="childPropertyName"/> inside the
    /// object-valued property <paramref name="parentPropertyName"/>, or <c>null</c>
    /// when any part of that path is missing or has a different kind.
    /// </summary>
    private static string? ReadNestedString(
        JsonElement source,
        string parentPropertyName,
        string childPropertyName)
    {
        if (source.ValueKind != JsonValueKind.Object
            || !source.TryGetProperty(parentPropertyName, out JsonElement parent)
            || parent.ValueKind != JsonValueKind.Object)
        {
            return null;
        }

        return ReadString(parent, childPropertyName);
    }

    /// <summary>
    /// Accepts a value that consists only of digits (after trimming) and left-pads it
    /// with zeros to at least six characters.
    /// </summary>
    private static bool TryNormalizeArticleNumber(string? rawValue, out string articleNumber)
    {
        articleNumber = string.Empty;

        if (string.IsNullOrWhiteSpace(rawValue))
        {
            return false;
        }

        string trimmed = rawValue.Trim();
        if (!trimmed.All(char.IsDigit))
        {
            return false;
        }

        // PadLeft is a no-op for values that already have six or more characters.
        articleNumber = trimmed.PadLeft(6, '0');
        return articleNumber.Length > 0;
    }

    /// <summary>
    /// Appends the <c>page_size</c> and <c>page</c> query parameters to the base URL.
    /// Page numbers below 1 are treated as 1.
    /// </summary>
    private static string BuildPageUrl(string baseCollectionUrl, int page)
    {
        int safePage = Math.Max(1, page);
        string separator = baseCollectionUrl.Contains('?', StringComparison.Ordinal) ? "&" : "?";
        return $"{baseCollectionUrl}{separator}page_size={PageSize}&page={safePage}";
    }

    /// <summary>
    /// Trims the URL and removes its query string and fragment. Returns an empty
    /// string for null or blank input.
    /// </summary>
    private static string NormalizeBaseCollectionUrl(string baseCollectionUrl)
    {
        if (string.IsNullOrWhiteSpace(baseCollectionUrl))
        {
            return string.Empty;
        }

        string normalized = baseCollectionUrl.Trim();

        // Cut at the first '?' or '#': paging parameters appended after a fragment
        // would never be sent to the server.
        int cutIndex = normalized.IndexOfAny(QueryOrFragmentDelimiters);
        return cutIndex >= 0 ? normalized[..cutIndex] : normalized;
    }

    /// <summary>Result of scanning a whole collection.</summary>
    public sealed class CollectionScanResult
    {
        /// <summary>Article numbers found, compared case-insensitively.</summary>
        public HashSet<string> ArticleNumbers { get; init; } = new(StringComparer.OrdinalIgnoreCase);

        /// <summary>Absolute product URLs found, compared case-insensitively.</summary>
        public HashSet<string> ProductUrls { get; init; } = new(StringComparer.OrdinalIgnoreCase);
    }

    // Items extracted from a single collection page.
    private sealed class CollectionPageItems
    {
        public HashSet<string> ArticleNumbers { get; init; } = new(StringComparer.OrdinalIgnoreCase);

        public HashSet<string> ProductUrls { get; init; } = new(StringComparer.OrdinalIgnoreCase);
    }
}