using AngleSharp.Dom;
using AngleSharp.Html.Parser;
using Sonex.Data.Records;
using System.Net;
using System.Text;
using System.Text.Json;
using WorkerCore = Sonex.Library.WorkersCore.Worker;

namespace Sonex.Worker.WebSync;

/// <summary>
/// Executes one product synchronization task: downloads the product page, extracts product data from
/// the HTML and the catalog JSON embedded in it, upserts the result and optionally synchronizes images.
/// </summary>
internal sealed class WebSyncProductTaskRunner
{
    // Single shared client; reused for every page download.
    private static readonly HttpClient ProductHttpClient = new();

    // Operation names passed to the run report and logger.
    private const string OperationProductTaskExecution = "ProductTaskExecution";
    private const string OperationPageDownload = "PageDownload";
    private const string OperationProductUpsert = "ProductUpsert";

    private const string DictionaryTableProducts = "products";

    // Specification labels exactly as they are matched against the page's specification list.
    private const string ArticleNumberKey = "Artikelnummer";
    private const string OnlineOnlyKey = "Online Only";
    private const string MaterialKey = "Materiaal";
    private const string ColorKey = "Kleur";

    private static readonly HashSet<string> SupportedImageExtensions = new(StringComparer.OrdinalIgnoreCase)
    {
        ".jpg",
        ".jpeg",
        ".png",
        ".webp"
    };

    private readonly WebSyncImageSynchronizer _imageSynchronizer = new();

    /// <summary>
    /// Runs the product task for <paramref name="productUrl"/>, retrying after a delay when an attempt fails.
    /// </summary>
    /// <param name="productUrl">URL of the product page to process.</param>
    /// <param name="targetImagesPath">Path handed to the image synchronizer.</param>
    /// <param name="updateImages">Whether images are synchronized after a successful upsert.</param>
    /// <param name="collectionSnapshot">Snapshot used to resolve the trending/nieuw/acties flags.</param>
    /// <param name="retryCount">Number of retries; the total number of attempts is at least one.</param>
    /// <param name="downloadTimeoutSeconds">Timeout applied to each attempt via a linked token source.</param>
    /// <param name="retryDelaySeconds">Delay between a failed attempt and the next one.</param>
    /// <param name="runReport">Report that receives counters and the final failure, if any.</param>
    /// <param name="waitIfPaused">Callback awaited before each step.</param>
    /// <param name="cancellationToken">Caller's cancellation token.</param>
    /// <exception cref="OperationCanceledException">
    /// Thrown when <paramref name="cancellationToken"/> is cancelled.
    /// </exception>
    public async Task ExecuteWithRetryAsync(
        string productUrl,
        string targetImagesPath,
        bool updateImages,
        WebSyncCollectionSnapshot collectionSnapshot,
        int retryCount,
        int downloadTimeoutSeconds,
        int retryDelaySeconds,
        WebSyncRunReport runReport,
        Func<CancellationToken, Task> waitIfPaused,
        CancellationToken cancellationToken)
    {
        int attempts = Math.Max(1, retryCount + 1);

        for (int attempt = 1; attempt <= attempts; attempt++)
        {
            cancellationToken.ThrowIfCancellationRequested();
            await waitIfPaused(cancellationToken).ConfigureAwait(false);

            try
            {
                // The linked source lets the per-attempt timeout cancel the work without
                // cancelling the caller's token.
                using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
                timeoutCts.CancelAfter(TimeSpan.FromSeconds(downloadTimeoutSeconds));

                await ExecuteSingleTaskAsync(
                    productUrl,
                    targetImagesPath,
                    updateImages,
                    collectionSnapshot,
                    runReport,
                    waitIfPaused,
                    timeoutCts.Token).ConfigureAwait(false);
                return;
            }
            catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
            {
                // Caller-requested cancellation is not a task failure: propagate it on every
                // attempt (including the last) instead of reporting it as an error.
                throw;
            }
            catch (Exception) when (attempt < attempts)
            {
                // Covers timeouts (cancellation raised by the linked source only) and any other
                // failure while attempts remain.
                await waitIfPaused(cancellationToken).ConfigureAwait(false);
                await Task.Delay(TimeSpan.FromSeconds(retryDelaySeconds), cancellationToken).ConfigureAwait(false);
            }
            catch (Exception ex)
            {
                string operation = ex is WebSyncOperationException operationException
                    ? operationException.Operation
                    : OperationProductTaskExecution;
                string message = $"Task failed for {productUrl} after {attempts} attempts. Details={ex.Message}";

                PrintTaskConsoleError(productUrl, attempt, attempts, ex);
                runReport.RegisterException(operation, ex, message);
                WorkerCore.LogError(message, ex, operation);
                return;
            }
        }
    }

    /// <summary>
    /// Performs one attempt: download, extract, upsert and (optionally) image synchronization.
    /// Returns without upserting when no product could be extracted from the page.
    /// </summary>
    private async Task ExecuteSingleTaskAsync(
        string productUrl,
        string targetImagesPath,
        bool updateImages,
        WebSyncCollectionSnapshot collectionSnapshot,
        WebSyncRunReport runReport,
        Func<CancellationToken, Task> waitIfPaused,
        CancellationToken cancellationToken)
    {
        await waitIfPaused(cancellationToken).ConfigureAwait(false);
        DownloadPageResult downloadedPage = await DownloadPageHtmlAsync(
            productUrl,
            runReport,
            cancellationToken).ConfigureAwait(false);

        await waitIfPaused(cancellationToken).ConfigureAwait(false);
        ProductPageData? product = await ExtractProductResultAsync(
            downloadedPage.FinalUrl,
            downloadedPage.Html,
            cancellationToken).ConfigureAwait(false);
        if (product is null) return;

        await waitIfPaused(cancellationToken).ConfigureAwait(false);
        await UpsertProductAsync(product, collectionSnapshot, runReport, cancellationToken).ConfigureAwait(false);

        if (updateImages)
        {
            await waitIfPaused(cancellationToken).ConfigureAwait(false);
            await _imageSynchronizer.UpdateAsync(
                product.ArticleNumber,
                product.ImageUrls,
                targetImagesPath,
                runReport,
                waitIfPaused,
                cancellationToken).ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Maps the extracted product onto a <c>ProductWebInfoRecord</c> and upserts it.
    /// </summary>
    /// <exception cref="OperationCanceledException">
    /// Thrown when the upsert failed and either the token is cancelled or the reported error looks
    /// like a cancellation.
    /// </exception>
    /// <exception cref="WebSyncOperationException">Thrown for any other unsuccessful upsert.</exception>
    private static async Task UpsertProductAsync(
        ProductPageData product,
        WebSyncCollectionSnapshot collectionSnapshot,
        WebSyncRunReport runReport,
        CancellationToken cancellationToken)
    {
        bool isTrending = collectionSnapshot.IsTrending(product.ArticleNumber);
        bool isNieuw = collectionSnapshot.IsNieuw(product.ArticleNumber);
        bool isActies = collectionSnapshot.IsActies(product.ArticleNumber);

        var productRecord = new ProductWebInfoRecord
        {
            Artnr = product.ArticleNumber,
            Url = product.Url,
            Title = product.Title,
            Keywords = product.Keywords,
            Notification = product.Notification,
            PromoText = product.PromoText,
            StockMessage = product.StockMessage,
            Category = product.Category,
            RootCategory = product.RootCategory,
            LastCategory = product.LastCategory,
            Description = product.Description,
            Material = product.Material,
            Color = product.Color,
            Series = product.Series,
            MinimumOrderQuantity = product.MinimumOrderQuantity,
            OnlineOnly = product.OnlineOnly,
            CanOrder = product.CanOrder,
            IsTrending = isTrending,
            IsNieuw = isNieuw,
            IsActies = isActies,
        };

        var upsertResult = await WorkerCore.ExecuteDatabaseSingleWithRetryAsync(
            ct => productRecord.Upsert(ct),
            OperationProductUpsert,
            cancellationToken).ConfigureAwait(false);

        if (!upsertResult.Success || upsertResult.Item != true)
        {
            // The database helper reports errors as data; translate cancellation back into an
            // exception so the retry loop can tell it apart from a genuine failure.
            if (cancellationToken.IsCancellationRequested
                || IsCancellationError(upsertResult.ErrorType, upsertResult.ErrorMessage))
            {
                throw new OperationCanceledException(
                    upsertResult.ErrorMessage ?? "The operation was canceled.",
                    cancellationToken);
            }

            string message = upsertResult.ErrorMessage
                ?? $"Unknown database error for article {product.ArticleNumber}.";
            string errorType = string.IsNullOrWhiteSpace(upsertResult.ErrorType)
                ? "Unknown"
                : upsertResult.ErrorType;

            throw new WebSyncOperationException(
                OperationProductUpsert,
                $"Product upsert failed for article {product.ArticleNumber}. DbErrorType={errorType}. Message={message}",
                new InvalidOperationException(message));
        }

        runReport.IncrementPageUpdated();
        RegisterDictionaryValues(product, runReport);
    }

    /// <summary>
    /// Registers the product's text fields with the run report under the products dictionary table.
    /// </summary>
    private static void RegisterDictionaryValues(
        ProductPageData product,
        WebSyncRunReport runReport)
    {
        runReport.RegisterDictionaryValue(DictionaryTableProducts, "title", product.Title);
        runReport.RegisterDictionaryValue(DictionaryTableProducts, "keywords", product.Keywords);
        runReport.RegisterDictionaryValue(DictionaryTableProducts, "notification", product.Notification);
        runReport.RegisterDictionaryValue(DictionaryTableProducts, "promo_text", product.PromoText);
        runReport.RegisterDictionaryValue(DictionaryTableProducts, "stock_message", product.StockMessage);
        runReport.RegisterDictionaryValue(DictionaryTableProducts, "category", product.Category);
        runReport.RegisterDictionaryValue(DictionaryTableProducts, "root_category", product.RootCategory);
        runReport.RegisterDictionaryValue(DictionaryTableProducts, "last_category", product.LastCategory);
        runReport.RegisterDictionaryValue(DictionaryTableProducts, "description", product.Description);
        runReport.RegisterDictionaryValue(DictionaryTableProducts, "material", product.Material);
        runReport.RegisterDictionaryValue(DictionaryTableProducts, "color", product.Color);
        runReport.RegisterDictionaryValue(DictionaryTableProducts, "series", product.Series);
    }

    /// <summary>
    /// Downloads the page and returns its HTML together with the request URI of the final response.
    /// </summary>
    /// <exception cref="WebSyncOperationException">
    /// Wraps every failure except <see cref="OperationCanceledException"/>, which is left untouched.
    /// </exception>
    private static async Task<DownloadPageResult> DownloadPageHtmlAsync(
        string productUrl,
        WebSyncRunReport runReport,
        CancellationToken cancellationToken)
    {
        try
        {
            using var response = await ProductHttpClient.GetAsync(
                productUrl,
                HttpCompletionOption.ResponseHeadersRead,
                cancellationToken).ConfigureAwait(false);
            response.EnsureSuccessStatusCode();

            string html = await response.Content.ReadAsStringAsync(cancellationToken).ConfigureAwait(false);
            string finalUrl = response.RequestMessage?.RequestUri?.AbsoluteUri ?? productUrl;

            runReport.IncrementPageDownloaded();
            return new DownloadPageResult { Html = html, FinalUrl = finalUrl };
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            throw new WebSyncOperationException(
                OperationPageDownload,
                $"Page download failed for {productUrl}.",
                ex);
        }
    }

    /// <summary>
    /// Parses the HTML and builds the product data, preferring values found in the document and
    /// falling back to the embedded catalog JSON.
    /// </summary>
    /// <returns>The product, or <c>null</c> when no article number or no title could be determined.</returns>
    private static async Task<ProductPageData?> ExtractProductResultAsync(
        string finalUrl,
        string html,
        CancellationToken cancellationToken)
    {
        var parser = new HtmlParser();
        IDocument document = await parser.ParseDocumentAsync(html, cancellationToken).ConfigureAwait(false);

        ProductSpecifications specifications = ExtractSpecifications(document);
        CatalogProductData? catalogData = ExtractCatalogProductData(
            document,
            finalUrl,
            specifications.ArticleNumber);

        string articleNumber = FirstNotEmpty(specifications.ArticleNumber, catalogData?.ArticleNumber);
        if (string.IsNullOrWhiteSpace(articleNumber)) return null;

        string title = FirstNotEmpty(
            NormalizeText(document.QuerySelector("div.product__header h1")?.TextContent),
            catalogData?.Title);
        if (string.IsNullOrWhiteSpace(title)) return null;

        IReadOnlyList<string> categoryNames = catalogData?.CategoryNames ?? [];
        CategoryData categoryData = ExtractCategoryData(document, categoryNames);

        string stockMessage = NormalizeText(document.QuerySelector("div.stock__message span")?.TextContent);
        bool canOrder = ResolveCanOrder(document, stockMessage, catalogData);
        bool onlineOnly = specifications.HasOnlineOnly
            ? specifications.OnlineOnly
            : (catalogData?.OnlineOnly ?? false);
        int? minimumOrderQuantity = ResolveMinimumOrderQuantity(document, catalogData);

        return new ProductPageData
        {
            Url = FirstNotEmpty(finalUrl, catalogData?.Url),
            ArticleNumber = articleNumber,
            Title = title,
            Keywords = FirstNotEmpty(ExtractKeywords(document), catalogData?.Keywords),
            StockMessage = stockMessage,
            Notification = NormalizeText(document.QuerySelector("div.product__notifications p")?.TextContent),
            PromoText = ExtractPromoText(document),
            Category = categoryData.FullCategory,
            RootCategory = categoryData.RootCategory,
            LastCategory = categoryData.LastCategory,
            Description = ResolveDescription(document, catalogData?.DescriptionHtml),
            Material = specifications.Material,
            Color = specifications.Color,
            Series = specifications.Series,
            MinimumOrderQuantity = minimumOrderQuantity,
            OnlineOnly = onlineOnly,
            CanOrder = canOrder,
            ImageUrls = ExtractImageUrls(finalUrl, document, catalogData?.ImageGalleryPlaceholders)
        };
    }

    /// <summary>
    /// Collects the catalog JSON payloads embedded in the page and picks the best candidate:
    /// article number and URL match, then article number only, then URL only, then the first one.
    /// </summary>
    /// <returns>The selected candidate, or <c>null</c> when the page contains none.</returns>
    private static CatalogProductData? ExtractCatalogProductData(
        IDocument document,
        string finalUrl,
        string articleNumberFromSpecifications)
    {
        var candidates = new List<CatalogProductData>();

        foreach (IElement element in document.QuerySelectorAll("catalog-product-configuration"))
        {
            CatalogProductData? data = ParseCatalogProductJson(element.GetAttribute(":product-data"));
            if (data is not null) candidates.Add(data);
        }

        foreach (IElement element in document.QuerySelectorAll("catalog-product-family-products"))
        {
            CatalogProductData? data = ParseCatalogProductJson(element.GetAttribute(":parent-product"));
            if (data is not null) candidates.Add(data);
        }

        if (candidates.Count == 0) return null;

        string normalizedArticleNumber = NormalizeArticleNumber(articleNumberFromSpecifications);
        string normalizedFinalUrlPath = NormalizeComparableUrlPath(finalUrl);

        CatalogProductData? exactMatch = candidates.FirstOrDefault(candidate =>
            IsArticleNumberMatch(candidate.ArticleNumber, normalizedArticleNumber)
            && IsUrlMatch(candidate.Url, normalizedFinalUrlPath));
        if (exactMatch is not null) return exactMatch;

        CatalogProductData? articleMatch = candidates.FirstOrDefault(candidate =>
            IsArticleNumberMatch(candidate.ArticleNumber, normalizedArticleNumber));
        if (articleMatch is not null) return articleMatch;

        CatalogProductData? urlMatch = candidates.FirstOrDefault(candidate =>
            IsUrlMatch(candidate.Url, normalizedFinalUrlPath));
        if (urlMatch is not null) return urlMatch;

        return candidates[0];
    }

    /// <summary>
    /// HTML-decodes and parses one catalog JSON attribute value.
    /// </summary>
    /// <returns>
    /// The parsed data, or <c>null</c> when the value is blank, is not valid JSON or its root is not
    /// a JSON object.
    /// </returns>
    private static CatalogProductData? ParseCatalogProductJson(string? rawJson)
    {
        if (string.IsNullOrWhiteSpace(rawJson)) return null;

        string decodedJson = WebUtility.HtmlDecode(rawJson);
        if (string.IsNullOrWhiteSpace(decodedJson)) return null;

        try
        {
            using JsonDocument parsed = JsonDocument.Parse(decodedJson);
            JsonElement root = parsed.RootElement;

            // TryGetProperty throws on anything but an object, so reject other roots up front.
            if (root.ValueKind != JsonValueKind.Object) return null;

            return new CatalogProductData
            {
                ArticleNumber = NormalizeText(GetJsonString(root, "sku")),
                Title = NormalizeText(GetJsonString(root, "name")),
                Url = NormalizeText(GetJsonString(root, "url")),
                Keywords = NormalizeText(GetJsonString(root, "meta_keywords")),
                DescriptionHtml = GetJsonString(root, "description"),
                IsSalable = GetJsonBool(root, "is_salable"),
                OnlineOnly = GetJsonBool(root, "online_only"),
                InStock = GetJsonBool(root, "in_stock"),
                StatusInStock = GetJsonBool(root, "status_in_stock"),
                FrontendStock = GetJsonInt(root, "frontend_stock"),
                NotSaleable = GetJsonBool(root, "not_saleable"),
                XenosMinBestelhoeveelheid = GetJsonInt(root, "xenos_min_bestelhoeveelheid"),
                MinimumQtyAllowedIncrement = GetMinimumQtyAllowedIncrement(root),
                CategoryNames = GetJsonStringArray(root, "category_names"),
                ImageGalleryPlaceholders = GetJsonStringArray(root, "image_gallery_placeholders")
            };
        }
        catch (JsonException)
        {
            // Malformed payloads are expected on some pages; the caller falls back to the HTML.
            return null;
        }
    }

    /// <summary>
    /// Reads a property as text: strings as-is, numbers as their raw text, booleans as
    /// "true"/"false"; anything else (or a missing property) yields an empty string.
    /// </summary>
    private static string GetJsonString(JsonElement root, string propertyName)
    {
        if (!root.TryGetProperty(propertyName, out JsonElement element)) return string.Empty;

        return element.ValueKind switch
        {
            JsonValueKind.String => element.GetString() ?? string.Empty,
            JsonValueKind.Number => element.GetRawText(),
            JsonValueKind.True => "true",
            JsonValueKind.False => "false",
            _ => string.Empty
        };
    }

    /// <summary>
    /// Reads a property as a boolean. Accepts JSON booleans, 32-bit integers (non-zero is true) and
    /// strings that parse as a boolean or integer or equal "ja"/"nee" (case-insensitive).
    /// </summary>
    /// <returns>The value, or <c>null</c> when the property is missing or not recognised.</returns>
    private static bool? GetJsonBool(JsonElement root, string propertyName)
    {
        if (!root.TryGetProperty(propertyName, out JsonElement element)) return null;

        if (element.ValueKind == JsonValueKind.True) return true;
        if (element.ValueKind == JsonValueKind.False) return false;
        if (element.ValueKind == JsonValueKind.Number && element.TryGetInt32(out int intValue)) return intValue != 0;

        if (element.ValueKind == JsonValueKind.String)
        {
            string value = NormalizeText(element.GetString());
            if (string.IsNullOrWhiteSpace(value)) return null;
            if (bool.TryParse(value, out bool boolValue)) return boolValue;
            if (int.TryParse(value, out int parsedInt)) return parsedInt != 0;
            if (string.Equals(value, "ja", StringComparison.OrdinalIgnoreCase)) return true;
            if (string.Equals(value, "nee", StringComparison.OrdinalIgnoreCase)) return false;
        }

        return null;
    }

    /// <summary>
    /// Reads a property as an integer. JSON numbers must fit in 32 bits; JSON strings must contain
    /// a positive integer as understood by <see cref="TryParsePositiveInteger"/>.
    /// </summary>
    /// <returns>The value, or <c>null</c> when the property is missing or cannot be read.</returns>
    private static int? GetJsonInt(JsonElement root, string propertyName)
    {
        if (!root.TryGetProperty(propertyName, out JsonElement element)) return null;

        if (element.ValueKind == JsonValueKind.Number)
        {
            if (element.TryGetInt32(out int numberValue)) return numberValue;
            return null;
        }

        if (element.ValueKind == JsonValueKind.String
            && TryParsePositiveInteger(element.GetString(), out int parsedValue))
        {
            return parsedValue;
        }

        return null;
    }

    /// <summary>
    /// Reads <c>minimum_qty_allowed.increment</c> from the catalog JSON.
    /// </summary>
    /// <returns>The increment, or <c>null</c> when it is absent or not an integer.</returns>
    private static int? GetMinimumQtyAllowedIncrement(JsonElement root)
    {
        if (!root.TryGetProperty("minimum_qty_allowed", out JsonElement minimumQtyAllowed)) return null;
        if (minimumQtyAllowed.ValueKind != JsonValueKind.Object) return null;
        if (!minimumQtyAllowed.TryGetProperty("increment", out JsonElement increment)) return null;

        if (increment.ValueKind == JsonValueKind.Number && increment.TryGetInt32(out int numberValue))
        {
            return numberValue;
        }

        if (increment.ValueKind == JsonValueKind.String
            && TryParsePositiveInteger(increment.GetString(), out int parsedValue))
        {
            return parsedValue;
        }

        return null;
    }

    /// <summary>
    /// Reads a property as a list of normalized, non-blank strings. Non-string items are skipped;
    /// a missing or non-array property yields an empty list.
    /// </summary>
    private static List<string> GetJsonStringArray(JsonElement root, string propertyName)
    {
        var values = new List<string>();
        if (!root.TryGetProperty(propertyName, out JsonElement element)
            || element.ValueKind != JsonValueKind.Array)
        {
            return values;
        }

        foreach (JsonElement item in element.EnumerateArray())
        {
            if (item.ValueKind != JsonValueKind.String) continue;

            string value = NormalizeText(item.GetString());
            if (string.IsNullOrWhiteSpace(value)) continue;

            values.Add(value);
        }

        return values;
    }

    /// <summary>
    /// Returns the first non-blank argument, trimmed, or an empty string when both are blank.
    /// </summary>
    private static string FirstNotEmpty(string? first, string? second)
    {
        if (!string.IsNullOrWhiteSpace(first)) return first.Trim();
        return string.IsNullOrWhiteSpace(second) ? string.Empty : second.Trim();
    }

    /// <summary>Trims the article number; blank input becomes an empty string.</summary>
    private static string NormalizeArticleNumber(string? articleNumber)
    {
        return string.IsNullOrWhiteSpace(articleNumber) ? string.Empty : articleNumber.Trim();
    }

    /// <summary>
    /// Case-insensitive article number comparison; a blank value on either side never matches.
    /// </summary>
    private static bool IsArticleNumberMatch(string candidateArticleNumber, string normalizedExpectedArticleNumber)
    {
        if (string.IsNullOrWhiteSpace(normalizedExpectedArticleNumber)) return false;

        string normalizedCandidate = NormalizeArticleNumber(candidateArticleNumber);
        if (string.IsNullOrWhiteSpace(normalizedCandidate)) return false;

        return string.Equals(
            normalizedCandidate,
            normalizedExpectedArticleNumber,
            StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Compares the normalized path of <paramref name="candidateUrl"/> with an already normalized
    /// path; a blank value on either side never matches.
    /// </summary>
    private static bool IsUrlMatch(string candidateUrl, string normalizedFinalUrlPath)
    {
        if (string.IsNullOrWhiteSpace(normalizedFinalUrlPath)) return false;

        string normalizedCandidatePath = NormalizeComparableUrlPath(candidateUrl);
        if (string.IsNullOrWhiteSpace(normalizedCandidatePath)) return false;

        return string.Equals(
            normalizedCandidatePath,
            normalizedFinalUrlPath,
            StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Reduces an absolute or relative URL to a comparable path: HTML-decoded, with escaped
    /// slashes ("\/") unescaped, then normalized by <see cref="NormalizeUrlPath"/>.
    /// </summary>
    private static string NormalizeComparableUrlPath(string? url)
    {
        if (string.IsNullOrWhiteSpace(url)) return string.Empty;

        string decoded = WebUtility.HtmlDecode(url).Replace("\\/", "/", StringComparison.Ordinal).Trim();

        if (Uri.TryCreate(decoded, UriKind.Absolute, out Uri? absoluteUri))
        {
            return NormalizeUrlPath(absoluteUri.AbsolutePath);
        }

        if (Uri.TryCreate(decoded, UriKind.Relative, out Uri? relativeUri))
        {
            return NormalizeUrlPath(relativeUri.OriginalString);
        }

        return NormalizeUrlPath(decoded);
    }

    /// <summary>
    /// Strips query and fragment, ensures a leading slash, removes trailing slashes (the root stays
    /// "/") and lower-cases the result.
    /// </summary>
    private static string NormalizeUrlPath(string value)
    {
        if (string.IsNullOrWhiteSpace(value)) return string.Empty;

        string normalized = value;

        int queryIndex = normalized.IndexOf('?');
        if (queryIndex >= 0) normalized = normalized[..queryIndex];

        int fragmentIndex = normalized.IndexOf('#');
        if (fragmentIndex >= 0) normalized = normalized[..fragmentIndex];

        normalized = normalized.Trim();
        if (!normalized.StartsWith("/", StringComparison.Ordinal)) normalized = "/" + normalized;

        normalized = normalized.TrimEnd('/');
        if (normalized.Length == 0) normalized = "/";

        return normalized.ToLowerInvariant();
    }

    /// <summary>
    /// Returns the description found in the document, or, when that is blank, the text of the
    /// catalog description HTML.
    /// </summary>
    private static string ResolveDescription(IDocument document, string? descriptionHtml)
    {
        string normalizedFromDocument = ExtractDescriptionFromDocument(document);
        if (!string.IsNullOrWhiteSpace(normalizedFromDocument)) return normalizedFromDocument;

        return ExtractDescriptionFromHtml(descriptionHtml);
    }

    /// <summary>
    /// Converts a description HTML fragment to normalized plain text.
    /// </summary>
    private static string ExtractDescriptionFromHtml(string? descriptionHtml)
    {
        if (string.IsNullOrWhiteSpace(descriptionHtml)) return string.Empty;

        string decoded = WebUtility.HtmlDecode(descriptionHtml);

        // Wrap the fragment in a single container so the whole text can be read from one known
        // root element, whatever markup the fragment itself contains.
        string html = $"<div>{decoded}</div>";

        var parser = new HtmlParser();
        IDocument fragmentDocument = parser.ParseDocument(html);
        IElement? root = fragmentDocument.QuerySelector("div");
        if (root is null) return string.Empty;

        return NormalizeText(root.TextContent);
    }

    /// <summary>
    /// Reads the description block of the page: non-blank paragraphs joined by new lines, or the
    /// block's full text when it contains no non-blank paragraph.
    /// </summary>
    private static string ExtractDescriptionFromDocument(IDocument document)
    {
        IElement? contentNode = document.QuerySelector("div.product__description div._content")
            ?? document.QuerySelector("div.product__description");
        if (contentNode is null) return string.Empty;

        var lines = new List<string>();
        foreach (IElement paragraph in contentNode.QuerySelectorAll("p"))
        {
            string text = NormalizeText(paragraph.TextContent);
            if (!string.IsNullOrWhiteSpace(text)) lines.Add(text);
        }

        if (lines.Count > 0) return string.Join(Environment.NewLine, lines);

        return NormalizeText(contentNode.TextContent);
    }

    /// <summary>
    /// Decides whether the product can be ordered. An "unavailable" stock message always wins;
    /// otherwise any negative stock/saleability flag in the catalog data makes it not orderable.
    /// </summary>
    private static bool ResolveCanOrder(
        IDocument document,
        string stockMessage,
        CatalogProductData? catalogData)
    {
        if (IsUnavailableStockMessage(stockMessage)) return false;

        if (catalogData is not null)
        {
            if (catalogData.IsSalable.HasValue && !catalogData.IsSalable.Value) return false;
            if (catalogData.NotSaleable.HasValue && catalogData.NotSaleable.Value) return false;
            if (catalogData.StatusInStock.HasValue && !catalogData.StatusInStock.Value) return false;
            if (catalogData.InStock.HasValue && !catalogData.InStock.Value) return false;
            if (catalogData.FrontendStock.HasValue && catalogData.FrontendStock.Value <= 0) return false;
            return true;
        }

        if (HasAddToCartButton(document)) return true;

        // NOTE(review): without catalog data the result is true whether or not an add-to-cart
        // button exists, so the check above has no effect. Confirm whether the fallback was meant
        // to be false; behaviour is intentionally left unchanged here.
        return true;
    }

    /// <summary>Whether the page contains a button inside the add-to-cart holder.</summary>
    private static bool HasAddToCartButton(IDocument document)
    {
        return document.QuerySelector("div.add-to-cart__holder button") is not null;
    }

    /// <summary>
    /// Whether the stock message contains one of the known "unavailable" phrases (case-insensitive).
    /// </summary>
    private static bool IsUnavailableStockMessage(string stockMessage)
    {
        if (string.IsNullOrWhiteSpace(stockMessage)) return false;

        return stockMessage.Contains("uitverkocht", StringComparison.OrdinalIgnoreCase)
            || stockMessage.Contains("niet verkoopbaar", StringComparison.OrdinalIgnoreCase)
            || stockMessage.Contains("niet online", StringComparison.OrdinalIgnoreCase)
            || stockMessage.Contains("alleen verkrijgbaar in onze winkels", StringComparison.OrdinalIgnoreCase)
            || stockMessage.Contains("verschijnt binnenkort online", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Returns the normalized content of the first "keywords" meta element, or an empty string.
    /// </summary>
    private static string ExtractKeywords(IDocument document)
    {
        foreach (IElement meta in document.QuerySelectorAll("meta[name][content]"))
        {
            string name = meta.GetAttribute("name") ?? string.Empty;
            if (!name.Equals("keywords", StringComparison.OrdinalIgnoreCase)) continue;

            string content = meta.GetAttribute("content") ?? string.Empty;
            return NormalizeText(content);
        }

        return string.Empty;
    }

    /// <summary>
    /// Builds the category path from the breadcrumbs (dropping "Home", a leading "Terug naar" and
    /// duplicates); falls back to the catalog category names when the breadcrumbs yield nothing.
    /// </summary>
    private static CategoryData ExtractCategoryData(
        IDocument document,
        IReadOnlyList<string> categoryNamesFromProductData)
    {
        var names = new List<string>();
        var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        foreach (IElement element in document.QuerySelectorAll("div.breadcrumbs [property='name']"))
        {
            string name = NormalizeText(element.TextContent);
            if (name.StartsWith("Terug naar", StringComparison.OrdinalIgnoreCase))
            {
                name = NormalizeText(name["Terug naar".Length..]);
            }

            if (string.IsNullOrWhiteSpace(name)) continue;
            if (string.Equals(name, "Home", StringComparison.OrdinalIgnoreCase)) continue;

            if (seen.Add(name)) names.Add(name);
        }

        if (names.Count == 0)
        {
            foreach (string categoryName in categoryNamesFromProductData)
            {
                if (string.IsNullOrWhiteSpace(categoryName)) continue;

                string normalized = NormalizeText(categoryName);
                if (seen.Add(normalized)) names.Add(normalized);
            }
        }

        if (names.Count == 0) return new CategoryData();

        return new CategoryData
        {
            FullCategory = string.Join(" > ", names),
            RootCategory = names[0],
            LastCategory = names[^1]
        };
    }

    /// <summary>Returns the normalized text of the promo block, or an empty string.</summary>
    private static string ExtractPromoText(IDocument document)
    {
        IElement? promoNode = document.QuerySelector("div.product__promo");
        if (promoNode is null) return string.Empty;

        return NormalizeText(promoNode.TextContent);
    }

    /// <summary>
    /// Reads the specification list. Each item contributes its first two spans as key and value;
    /// known keys are matched exactly, and any other key containing "serie" sets the series.
    /// </summary>
    private static ProductSpecifications ExtractSpecifications(IDocument document)
    {
        var result = new ProductSpecifications();

        foreach (IElement item in document.QuerySelectorAll("div.product__specifications ul li"))
        {
            IElement[] spans = item.QuerySelectorAll("span").Take(2).ToArray();
            if (spans.Length < 2) continue;

            string key = NormalizeText(spans[0].TextContent);
            if (string.IsNullOrWhiteSpace(key)) continue;

            string value = NormalizeText(spans[1].TextContent);

            switch (key)
            {
                case ArticleNumberKey:
                    result.ArticleNumber = value;
                    break;
                case OnlineOnlyKey:
                    result.HasOnlineOnly = true;
                    result.OnlineOnly = ParseOnlineOnly(value);
                    break;
                case MaterialKey:
                    result.Material = value;
                    break;
                case ColorKey:
                    result.Color = value;
                    break;
                default:
                    if (key.Contains("serie", StringComparison.OrdinalIgnoreCase))
                    {
                        result.Series = value;
                    }

                    break;
            }
        }

        return result;
    }

    /// <summary>
    /// Returns the first positive value among: the smallest quantity option on the page, the
    /// catalog's <c>xenos_min_bestelhoeveelheid</c>, and the catalog's minimum quantity increment.
    /// </summary>
    private static int? ResolveMinimumOrderQuantity(IDocument document, CatalogProductData? catalogData)
    {
        int? fromSelect = ExtractMinimumOrderQuantityFromSelect(document);
        if (fromSelect.HasValue && fromSelect.Value > 0) return fromSelect;

        int? fromXenosField = catalogData?.XenosMinBestelhoeveelheid;
        if (fromXenosField.HasValue && fromXenosField.Value > 0) return fromXenosField;

        int? fromIncrement = catalogData?.MinimumQtyAllowedIncrement;
        if (fromIncrement.HasValue && fromIncrement.Value > 0) return fromIncrement;

        return null;
    }

    /// <summary>
    /// Returns the smallest positive integer offered by the quantity select options, reading each
    /// option's value attribute first and its text second; <c>null</c> when none parses.
    /// </summary>
    private static int? ExtractMinimumOrderQuantityFromSelect(IDocument document)
    {
        int? minValue = null;
        IEnumerable<IElement> options = document.QuerySelectorAll(
            "div.product__qty select option, select[name='qty'] option");

        foreach (IElement option in options)
        {
            if (!TryParsePositiveInteger(option.GetAttribute("value"), out int value)
                && !TryParsePositiveInteger(option.TextContent, out value))
            {
                continue;
            }

            if (!minValue.HasValue || value < minValue.Value) minValue = value;
        }

        return minValue;
    }

    /// <summary>
    /// Parses the first run of digits in <paramref name="rawValue"/> as an integer.
    /// </summary>
    /// <returns><c>true</c> when a run of digits was found, fits in an int and is greater than zero.</returns>
    private static bool TryParsePositiveInteger(string? rawValue, out int value)
    {
        value = 0;
        if (string.IsNullOrWhiteSpace(rawValue)) return false;

        string normalized = NormalizeText(rawValue);
        if (normalized.Length == 0) return false;

        var digits = new StringBuilder(normalized.Length);
        foreach (char ch in normalized)
        {
            if (char.IsDigit(ch))
            {
                digits.Append(ch);
                continue;
            }

            // Stop at the first non-digit once the digit run has started.
            if (digits.Length > 0) break;
        }

        if (digits.Length == 0) return false;
        if (!int.TryParse(digits.ToString(), out value)) return false;

        return value > 0;
    }

    /// <summary>
    /// Interprets the "Online Only" specification value: "Ja"/"Nee" (case-insensitive), a boolean
    /// literal, or "1" for true; anything else is false.
    /// </summary>
    private static bool ParseOnlineOnly(string value)
    {
        string normalized = NormalizeText(value);
        if (string.Equals(normalized, "Ja", StringComparison.OrdinalIgnoreCase)) return true;
        if (string.Equals(normalized, "Nee", StringComparison.OrdinalIgnoreCase)) return false;
        if (bool.TryParse(normalized, out bool boolValue)) return boolValue;
        return string.Equals(normalized, "1", StringComparison.Ordinal);
    }

    /// <summary>
    /// Collects distinct image URLs from the gallery's <c>data-zoom</c> attributes; when the page
    /// yields none, falls back to the catalog's gallery placeholders.
    /// </summary>
    private static List<string> ExtractImageUrls(
        string pageUrl,
        IDocument document,
        IReadOnlyList<string>? placeholdersFromProductData)
    {
        var urls = new List<string>();
        var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        foreach (IElement element in document.QuerySelectorAll(
                     "div.product__media-gallery [data-zoom], div[class*='--gallery'] [data-zoom]"))
        {
            AddImageUrl(pageUrl, element.GetAttribute("data-zoom"), urls, seen);
        }

        if (urls.Count == 0 && placeholdersFromProductData is not null)
        {
            foreach (string placeholder in placeholdersFromProductData)
            {
                AddImageUrl(pageUrl, placeholder, urls, seen);
            }
        }

        return urls;
    }

    /// <summary>Adds the normalized image URL to the list when it is valid and not yet seen.</summary>
    private static void AddImageUrl(
        string pageUrl,
        string? rawUrl,
        List<string> urls,
        HashSet<string> seen)
    {
        if (!TryNormalizeImageUrl(pageUrl, rawUrl, out string normalized)) return;
        if (seen.Add(normalized)) urls.Add(normalized);
    }

    /// <summary>
    /// Resolves <paramref name="rawUrl"/> against the page URL and accepts it only when it is an
    /// http(s) URL whose path contains "/pub/cdn/" and ends in a supported image extension.
    /// </summary>
    private static bool TryNormalizeImageUrl(
        string pageUrl,
        string? rawUrl,
        out string imageUrl)
    {
        imageUrl = string.Empty;
        if (string.IsNullOrWhiteSpace(rawUrl)) return false;
        if (!Uri.TryCreate(pageUrl, UriKind.Absolute, out Uri? pageUri)) return false;

        string normalizedRaw = WebUtility.HtmlDecode(rawUrl.Trim()).Replace("\\/", "/", StringComparison.Ordinal);
        if (!Uri.TryCreate(pageUri, normalizedRaw, out Uri? imageUri)) return false;

        if (!string.Equals(imageUri.Scheme, Uri.UriSchemeHttp, StringComparison.OrdinalIgnoreCase)
            && !string.Equals(imageUri.Scheme, Uri.UriSchemeHttps, StringComparison.OrdinalIgnoreCase))
        {
            return false;
        }

        if (!imageUri.AbsolutePath.Contains("/pub/cdn/", StringComparison.OrdinalIgnoreCase)) return false;

        string extension = Path.GetExtension(imageUri.AbsolutePath);
        if (!SupportedImageExtensions.Contains(extension)) return false;

        imageUrl = imageUri.AbsoluteUri;
        return true;
    }

    /// <summary>
    /// HTML-decodes the text, replaces non-breaking spaces, collapses whitespace runs to a single
    /// space and trims; blank input becomes an empty string.
    /// </summary>
    private static string NormalizeText(string? value)
    {
        if (string.IsNullOrWhiteSpace(value)) return string.Empty;

        string decoded = WebUtility.HtmlDecode(value).Replace('\u00A0', ' ');
        return CollapseWhitespace(decoded).Trim();
    }

    /// <summary>Replaces every run of whitespace characters with a single space.</summary>
    private static string CollapseWhitespace(string value)
    {
        if (string.IsNullOrEmpty(value)) return string.Empty;

        var buffer = new char[value.Length];
        int write = 0;
        bool previousWasWhitespace = false;

        for (int i = 0; i < value.Length; i++)
        {
            char current = value[i];
            if (char.IsWhiteSpace(current))
            {
                if (previousWasWhitespace) continue;

                buffer[write++] = ' ';
                previousWasWhitespace = true;
                continue;
            }

            buffer[write++] = current;
            previousWasWhitespace = false;
        }

        return new string(buffer, 0, write);
    }

    /// <summary>Writes the failure header and the full exception to standard error.</summary>
    private static void PrintTaskConsoleError(string productUrl, int attempt, int totalAttempts, Exception exception)
    {
        Console.Error.WriteLine(
            $"[{DateTime.Now:yyyy-MM-dd HH:mm:ss}] [ProductTaskExecution] Task failed for {productUrl}. Attempt {attempt}/{totalAttempts}. ExceptionType={exception.GetType().Name}");
        Console.Error.WriteLine(exception.ToString());
    }

    /// <summary>
    /// Whether an error reported as text looks like a cancellation, judged by the error type name
    /// or, failing that, by the wording of the message.
    /// </summary>
    private static bool IsCancellationError(string? errorType, string? errorMessage)
    {
        if (!string.IsNullOrWhiteSpace(errorType))
        {
            if (errorType.Contains("OperationCanceledException", StringComparison.Ordinal)
                || errorType.Contains("TaskCanceledException", StringComparison.Ordinal))
            {
                return true;
            }
        }

        if (!string.IsNullOrWhiteSpace(errorMessage))
        {
            return errorMessage.Contains("operation was canceled", StringComparison.OrdinalIgnoreCase)
                || errorMessage.Contains("operation cancelled", StringComparison.OrdinalIgnoreCase);
        }

        return false;
    }

    /// <summary>Product data extracted from one page, ready to be upserted.</summary>
    private sealed class ProductPageData
    {
        public string Url { get; init; } = string.Empty;
        public string ArticleNumber { get; init; } = string.Empty;
        public string Title { get; init; } = string.Empty;
        public string Keywords { get; init; } = string.Empty;
        public string StockMessage { get; init; } = string.Empty;
        public string Notification { get; init; } = string.Empty;
        public string PromoText { get; init; } = string.Empty;
        public string Category { get; init; } = string.Empty;
        public string RootCategory { get; init; } = string.Empty;
        public string LastCategory { get; init; } = string.Empty;
        public string Description { get; init; } = string.Empty;
        public string Material { get; init; } = string.Empty;
        public string Color { get; init; } = string.Empty;
        public string Series { get; init; } = string.Empty;
        public int? MinimumOrderQuantity { get; init; }
        public bool OnlineOnly { get; init; }
        public bool CanOrder { get; init; }
        public List<string> ImageUrls { get; init; } = [];
    }

    /// <summary>Downloaded HTML and the URL it was finally served from.</summary>
    private sealed class DownloadPageResult
    {
        public string Html { get; init; } = string.Empty;
        public string FinalUrl { get; init; } = string.Empty;
    }

    /// <summary>Values read from the page's specification list.</summary>
    private sealed class ProductSpecifications
    {
        public string ArticleNumber { get; set; } = string.Empty;

        // True when the list contained an "Online Only" entry, regardless of its value.
        public bool HasOnlineOnly { get; set; }
        public bool OnlineOnly { get; set; }
        public string Material { get; set; } = string.Empty;
        public string Color { get; set; } = string.Empty;
        public string Series { get; set; } = string.Empty;
    }

    /// <summary>Values read from one embedded catalog JSON payload.</summary>
    private sealed class CatalogProductData
    {
        public string ArticleNumber { get; init; } = string.Empty;
        public string Title { get; init; } = string.Empty;
        public string Url { get; init; } = string.Empty;
        public string Keywords { get; init; } = string.Empty;
        public string DescriptionHtml { get; init; } = string.Empty;
        public bool? IsSalable { get; init; }
        public bool? OnlineOnly { get; init; }
        public bool? InStock { get; init; }
        public bool? StatusInStock { get; init; }
        public int? FrontendStock { get; init; }
        public bool? NotSaleable { get; init; }
        public int? XenosMinBestelhoeveelheid { get; init; }
        public int? MinimumQtyAllowedIncrement { get; init; }
        public List<string> CategoryNames { get; init; } = [];
        public List<string> ImageGalleryPlaceholders { get; init; } = [];
    }

    /// <summary>Category path split into its full, first and last parts.</summary>
    private sealed class CategoryData
    {
        public string FullCategory { get; init; } = string.Empty;
        public string RootCategory { get; init; } = string.Empty;
        public string LastCategory { get; init; } = string.Empty;
    }
}