using AngleSharp.Dom;
using AngleSharp.Html.Parser;
using System.Net;
using System.Text.Json;
namespace Sonex.Worker.WebSync;
internal sealed class WebSyncCollectionLoader
{
// Single HttpClient instance shared by every collection page request made by this class.
// NOTE(review): no Timeout is configured here, so HttpClient's built-in default timeout applies in addition
// to the per-attempt timeout created by the download code — confirm this is fine for large timeout settings.
private static readonly HttpClient CollectionHttpClient = new();
// Value sent as the page_size query parameter; a page yielding fewer product URLs than this ends the scan.
private const int PageSize = 96;
// Hard upper bound on the number of pages requested during one collection scan.
private const int MaxPages = 500;
// Operation name attached to the WebSyncOperationException raised when page parsing fails.
private const string OperationCollectionParsing = "CollectionParsing";
/// <summary>
/// Walks the paged collection listing and gathers every article number and product URL found on it.
/// </summary>
/// <param name="baseCollectionUrl">Collection URL; it is normalized before paging parameters are appended.</param>
/// <param name="retryCount">Number of additional download attempts per page after the first one fails.</param>
/// <param name="downloadTimeoutSeconds">Timeout, in seconds, applied to each individual download attempt.</param>
/// <param name="retryDelaySeconds">Delay, in seconds, between two download attempts of the same page.</param>
/// <param name="collectionOperation">Operation name attached to download failures.</param>
/// <param name="runReport">Run report that is notified for every successfully downloaded page.</param>
/// <param name="waitIfPaused">
/// Callback awaited before each unit of work. NOTE(review): presumably it completes only while the run is not paused.
/// </param>
/// <param name="cancellationToken">Token that aborts the scan.</param>
/// <returns>The case-insensitive sets of article numbers and product URLs collected from all visited pages.</returns>
/// <exception cref="WebSyncOperationException">The URL is blank, or a page could not be downloaded or parsed.</exception>
/// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was cancelled.</exception>
public async Task<CollectionScanResult> DownloadCollectionAsync(
    string baseCollectionUrl,
    int retryCount,
    int downloadTimeoutSeconds,
    int retryDelaySeconds,
    string collectionOperation,
    WebSyncRunReport runReport,
    Func<CancellationToken, Task> waitIfPaused,
    CancellationToken cancellationToken)
{
    var articleNumbers = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    var productUrls = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    string normalizedBaseUrl = NormalizeBaseCollectionUrl(baseCollectionUrl);

    // A blank URL can never be downloaded; fail immediately instead of burning every retry (and its delay)
    // on a request that is guaranteed to be rejected. The exception type matches a failed download.
    if (normalizedBaseUrl.Length == 0)
    {
        throw new WebSyncOperationException(
            collectionOperation,
            "Collection URL is empty.",
            new ArgumentException("A collection URL is required.", nameof(baseCollectionUrl)));
    }

    for (int page = 1; page <= MaxPages; page++)
    {
        cancellationToken.ThrowIfCancellationRequested();
        await waitIfPaused(cancellationToken).ConfigureAwait(false);

        string pageUrl = BuildPageUrl(normalizedBaseUrl, page);
        string html = await DownloadPageHtmlWithRetryAsync(
            pageUrl,
            retryCount,
            downloadTimeoutSeconds,
            retryDelaySeconds,
            collectionOperation,
            runReport,
            waitIfPaused,
            cancellationToken).ConfigureAwait(false);

        CollectionPageItems pageItems = await ExtractCollectionPageItemsAsync(
            normalizedBaseUrl,
            html,
            waitIfPaused,
            cancellationToken).ConfigureAwait(false);

        // A page without any items means the listing has been walked past its end.
        if (pageItems.ArticleNumbers.Count == 0 && pageItems.ProductUrls.Count == 0)
        {
            break;
        }

        articleNumbers.UnionWith(pageItems.ArticleNumbers);
        productUrls.UnionWith(pageItems.ProductUrls);

        // A page with fewer product URLs than the requested page size is treated as the last one.
        if (pageItems.ProductUrls.Count < PageSize)
        {
            break;
        }
    }

    return new CollectionScanResult
    {
        ArticleNumbers = articleNumbers,
        ProductUrls = productUrls
    };
}
/// <summary>
/// Downloads the HTML of one collection page, retrying failed or timed-out attempts.
/// </summary>
/// <param name="pageUrl">Absolute URL of the page to download.</param>
/// <param name="retryCount">Number of additional attempts after the first one; at least one attempt is always made.</param>
/// <param name="downloadTimeoutSeconds">Timeout, in seconds, covering headers and body of a single attempt.</param>
/// <param name="retryDelaySeconds">Delay, in seconds, before the next attempt.</param>
/// <param name="collectionOperation">Operation name attached to the failure exception.</param>
/// <param name="runReport">Run report whose downloaded-page counter is incremented on success.</param>
/// <param name="waitIfPaused">Callback awaited before each attempt and before each retry delay.</param>
/// <param name="cancellationToken">Token that aborts the download.</param>
/// <returns>The response body as a string.</returns>
/// <exception cref="WebSyncOperationException">Every attempt failed; the last failure is the inner exception.</exception>
/// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was cancelled.</exception>
private static async Task<string> DownloadPageHtmlWithRetryAsync(
    string pageUrl,
    int retryCount,
    int downloadTimeoutSeconds,
    int retryDelaySeconds,
    string collectionOperation,
    WebSyncRunReport runReport,
    Func<CancellationToken, Task> waitIfPaused,
    CancellationToken cancellationToken)
{
    int attempts = Math.Max(1, retryCount + 1);
    for (int attempt = 1; attempt <= attempts; attempt++)
    {
        cancellationToken.ThrowIfCancellationRequested();
        await waitIfPaused(cancellationToken).ConfigureAwait(false);

        try
        {
            // Linked source: fires on caller cancellation or when the per-attempt timeout elapses.
            using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
            timeoutCts.CancelAfter(TimeSpan.FromSeconds(downloadTimeoutSeconds));

            using HttpResponseMessage response = await CollectionHttpClient.GetAsync(
                pageUrl,
                HttpCompletionOption.ResponseHeadersRead,
                timeoutCts.Token).ConfigureAwait(false);
            response.EnsureSuccessStatusCode();

            string html = await response.Content.ReadAsStringAsync(timeoutCts.Token).ConfigureAwait(false);
            runReport.IncrementPageDownloaded();
            return html;
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // Caller-requested cancellation is not a download failure: let it propagate unchanged rather than
            // retrying it or (on the last attempt) disguising it as a WebSyncOperationException.
            throw;
        }
        catch (Exception) when (attempt < attempts)
        {
            // Timeouts and transport/HTTP errors are retried while attempts remain.
            await waitIfPaused(cancellationToken).ConfigureAwait(false);
            await Task.Delay(TimeSpan.FromSeconds(retryDelaySeconds), cancellationToken).ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            string message = $"Collection page download failed for {pageUrl} after {attempts} attempts.";
            throw new WebSyncOperationException(
                collectionOperation,
                message,
                ex);
        }
    }

    // Not reachable in practice (the loop returns or throws); required for definite return analysis.
    throw new WebSyncOperationException(
        collectionOperation,
        $"Collection page download failed for {pageUrl}.",
        new InvalidOperationException("Unknown collection page download failure."));
}
/// <summary>
/// Parses one collection page and extracts the article numbers and product URLs it references.
/// </summary>
/// <param name="baseCollectionUrl">Collection URL used to resolve relative links and image URLs.</param>
/// <param name="html">Raw HTML of the page.</param>
/// <param name="waitIfPaused">Callback awaited before each product tile is processed.</param>
/// <param name="cancellationToken">Token that aborts parsing.</param>
/// <returns>The items found in the product tiles and in the embedded catalog data of the page.</returns>
/// <exception cref="WebSyncOperationException">Any non-cancellation failure while parsing.</exception>
/// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was cancelled.</exception>
private static async Task<CollectionPageItems> ExtractCollectionPageItemsAsync(
    string baseCollectionUrl,
    string html,
    Func<CancellationToken, Task> waitIfPaused,
    CancellationToken cancellationToken)
{
    try
    {
        var parser = new HtmlParser();
        IDocument document = await parser.ParseDocumentAsync(html, cancellationToken).ConfigureAwait(false);
        var articleNumbers = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
        var productUrls = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        foreach (IElement productTile in document.QuerySelectorAll("a.product__tile"))
        {
            cancellationToken.ThrowIfCancellationRequested();
            await waitIfPaused(cancellationToken).ConfigureAwait(false);

            string? href = productTile.GetAttribute("href");
            if (TryExtractProductUrl(baseCollectionUrl, href, out string productUrl))
            {
                productUrls.Add(productUrl);
            }

            // Look the image up once; data-src is preferred over src when both are present.
            IElement? image = productTile.QuerySelector("img[data-src], img[src]");
            string? imageUrl = image?.GetAttribute("data-src") ?? image?.GetAttribute("src");

            // The image URL is the preferred source of the article number; the link slug is the fallback.
            if (TryExtractArticleNumberFromImageUrl(baseCollectionUrl, imageUrl, out string fromImage))
            {
                articleNumbers.Add(fromImage);
                continue;
            }

            if (TryExtractArticleNumberFromHref(baseCollectionUrl, href, out string fromHref))
            {
                articleNumbers.Add(fromHref);
            }
        }

        ExtractFromCatalogData(document, baseCollectionUrl, articleNumbers, productUrls);

        return new CollectionPageItems
        {
            ArticleNumbers = articleNumbers,
            ProductUrls = productUrls
        };
    }
    catch (Exception ex) when (ex is not OperationCanceledException)
    {
        // Cancellation passes through untouched; everything else is reported as a parsing failure.
        throw new WebSyncOperationException(
            OperationCollectionParsing,
            "Collection page parsing failed.",
            ex);
    }
}
/// <summary>
/// Reads the article number from an image URL whose path has the form <c>/pub/cdn/{article}/…</c>.
/// </summary>
/// <param name="baseCollectionUrl">Absolute URL used to resolve a relative image URL.</param>
/// <param name="rawUrl">Image URL as found in markup or JSON; may be HTML-encoded or contain <c>\/</c> escapes.</param>
/// <param name="articleNumber">The third path segment on success; otherwise an empty string.</param>
/// <returns><c>true</c> when the path has at least five segments and starts with <c>pub/cdn</c>.</returns>
private static bool TryExtractArticleNumberFromImageUrl(
    string baseCollectionUrl,
    string? rawUrl,
    out string articleNumber)
{
    articleNumber = string.Empty;

    if (string.IsNullOrWhiteSpace(rawUrl) ||
        !Uri.TryCreate(baseCollectionUrl, UriKind.Absolute, out Uri? collectionUri))
    {
        return false;
    }

    // Undo HTML entity encoding and JSON-style escaped slashes before resolving the URL.
    string cleanedUrl = WebUtility.HtmlDecode(rawUrl.Trim()).Replace("\\/", "/", StringComparison.Ordinal);
    if (!Uri.TryCreate(collectionUri, cleanedUrl, out Uri? resolvedUri))
    {
        return false;
    }

    string[] pathParts = resolvedUri.AbsolutePath
        .Split('/', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);

    bool hasCdnLayout = pathParts.Length >= 5
        && string.Equals(pathParts[0], "pub", StringComparison.OrdinalIgnoreCase)
        && string.Equals(pathParts[1], "cdn", StringComparison.OrdinalIgnoreCase);
    if (!hasCdnLayout)
    {
        return false;
    }

    // The segment right after "pub/cdn" carries the article number.
    string folder = pathParts[2].Trim();
    if (folder.Length == 0)
    {
        return false;
    }

    articleNumber = folder;
    return true;
}
/// <summary>
/// Reads the article number from the numeric suffix of a product link slug (<c>…/some-name-1234</c>).
/// </summary>
/// <param name="baseCollectionUrl">Absolute URL used to resolve a relative link.</param>
/// <param name="href">Product link as found in markup or JSON.</param>
/// <param name="articleNumber">
/// The digits after the last dash of the final path segment, left-padded with zeros to six characters,
/// on success; otherwise an empty string.
/// </param>
/// <returns><c>true</c> when the final path segment ends in a dash followed only by digits.</returns>
private static bool TryExtractArticleNumberFromHref(
    string baseCollectionUrl,
    string? href,
    out string articleNumber)
{
    articleNumber = string.Empty;

    if (string.IsNullOrWhiteSpace(href))
    {
        return false;
    }

    if (!Uri.TryCreate(baseCollectionUrl, UriKind.Absolute, out Uri? collectionUri) ||
        !Uri.TryCreate(collectionUri, href.Trim(), out Uri? linkUri))
    {
        return false;
    }

    string[] pathParts = linkUri.AbsolutePath
        .Split('/', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
    if (pathParts.Length == 0)
    {
        return false;
    }

    // Only the last path segment (the slug) is inspected; it must contain a dash that is not its final character.
    string lastPart = pathParts[^1];
    int dashPosition = lastPart.LastIndexOf('-');
    bool hasSuffix = dashPosition >= 0 && dashPosition < lastPart.Length - 1;
    if (!hasSuffix)
    {
        return false;
    }

    string digits = lastPart.Substring(dashPosition + 1).Trim();
    if (digits.Length == 0 || digits.Any(character => !char.IsDigit(character)))
    {
        return false;
    }

    // PadLeft leaves values of six or more characters untouched.
    articleNumber = digits.PadLeft(6, '0');
    return true;
}
/// <summary>
/// Resolves a product link against the collection URL and accepts it only when it is an HTTP(S) address.
/// </summary>
/// <param name="baseCollectionUrl">Absolute URL used to resolve a relative link.</param>
/// <param name="href">Product link as found in markup or JSON.</param>
/// <param name="productUrl">The absolute URI string on success; otherwise an empty string.</param>
/// <returns><c>true</c> when the link resolves to an <c>http</c> or <c>https</c> URI.</returns>
private static bool TryExtractProductUrl(
    string baseCollectionUrl,
    string? href,
    out string productUrl)
{
    productUrl = string.Empty;

    if (string.IsNullOrWhiteSpace(href))
    {
        return false;
    }

    if (!Uri.TryCreate(baseCollectionUrl, UriKind.Absolute, out Uri? collectionUri) ||
        !Uri.TryCreate(collectionUri, href.Trim(), out Uri? targetUri))
    {
        return false;
    }

    // Links such as mailto: or javascript: resolve to absolute URIs too, so the scheme is checked explicitly.
    bool isWebAddress =
        string.Equals(targetUri.Scheme, Uri.UriSchemeHttp, StringComparison.OrdinalIgnoreCase) ||
        string.Equals(targetUri.Scheme, Uri.UriSchemeHttps, StringComparison.OrdinalIgnoreCase);

    if (isWebAddress)
    {
        productUrl = targetUri.AbsoluteUri;
    }

    return isWebAddress;
}
/// <summary>
/// Collects article numbers and product URLs from the JSON embedded in the <c>catalog-data</c> attribute
/// of every <c>catalog-page-builder-view</c> element of the page.
/// </summary>
/// <param name="document">Parsed collection page.</param>
/// <param name="baseCollectionUrl">Collection URL used to resolve relative links and image URLs.</param>
/// <param name="articleNumbers">Set that receives the article numbers found.</param>
/// <param name="productUrls">Set that receives the product URLs found.</param>
/// <remarks>
/// Extraction is best effort: attributes whose JSON is malformed or does not have the expected
/// object-with-<c>items</c>-array shape are skipped without failing the page.
/// </remarks>
private static void ExtractFromCatalogData(
    IDocument document,
    string baseCollectionUrl,
    HashSet<string> articleNumbers,
    HashSet<string> productUrls)
{
    foreach (IElement element in document.QuerySelectorAll("catalog-page-builder-view"))
    {
        // The attribute may be written as a bound (":" / "v-bind:") or a plain attribute.
        string? rawCatalogData = element.GetAttribute(":catalog-data")
            ?? element.GetAttribute("catalog-data")
            ?? element.GetAttribute("v-bind:catalog-data");
        if (string.IsNullOrWhiteSpace(rawCatalogData))
        {
            continue;
        }

        string decoded = WebUtility.HtmlDecode(rawCatalogData.Trim());
        if (decoded.Length == 0)
        {
            continue;
        }

        try
        {
            using JsonDocument json = JsonDocument.Parse(decoded);

            // TryGetProperty throws InvalidOperationException on anything but a JSON object, so the
            // root kind is checked first; otherwise an array/scalar payload would fail the whole page.
            if (json.RootElement.ValueKind != JsonValueKind.Object ||
                !json.RootElement.TryGetProperty("items", out JsonElement items) ||
                items.ValueKind != JsonValueKind.Array)
            {
                continue;
            }

            foreach (JsonElement item in items.EnumerateArray())
            {
                // Same reason as above: property lookups are only valid on JSON objects.
                if (item.ValueKind != JsonValueKind.Object)
                {
                    continue;
                }

                string? itemUrl = ReadString(item, "url");
                if (TryExtractProductUrl(baseCollectionUrl, itemUrl, out string productUrl))
                {
                    productUrls.Add(productUrl);
                }
                else
                {
                    string? urlKey = ReadString(item, "url_key");
                    if (TryExtractProductUrl(baseCollectionUrl, urlKey, out string productUrlFromUrlKey))
                    {
                        productUrls.Add(productUrlFromUrlKey);
                    }
                }

                // Article number sources in priority order: placeholder image URL, sku, link slug.
                string? placeholderUrl = ReadNestedString(item, "default_image", "placeholder_url");
                if (TryExtractArticleNumberFromImageUrl(baseCollectionUrl, placeholderUrl, out string fromPlaceholder))
                {
                    articleNumbers.Add(fromPlaceholder);
                    continue;
                }

                string? sku = ReadString(item, "sku");
                if (TryNormalizeArticleNumber(sku, out string fromSku))
                {
                    articleNumbers.Add(fromSku);
                    continue;
                }

                if (TryExtractArticleNumberFromHref(baseCollectionUrl, itemUrl, out string fromUrl))
                {
                    articleNumbers.Add(fromUrl);
                }
            }
        }
        catch (JsonException)
        {
            // Deliberate best effort: a malformed catalog-data attribute is ignored.
            continue;
        }
    }
}
/// <summary>
/// Reads a string-valued property from a JSON object.
/// </summary>
/// <param name="source">JSON element expected to be an object.</param>
/// <param name="propertyName">Name of the property to read.</param>
/// <returns>
/// The property value, or <c>null</c> when <paramref name="source"/> is not an object, the property
/// is missing, or its value is not a JSON string.
/// </returns>
private static string? ReadString(JsonElement source, string propertyName)
{
    // JsonElement.TryGetProperty throws InvalidOperationException for non-object elements,
    // so arrays, scalars and null are answered with null instead.
    if (source.ValueKind != JsonValueKind.Object)
    {
        return null;
    }

    if (!source.TryGetProperty(propertyName, out JsonElement value))
    {
        return null;
    }

    return value.ValueKind == JsonValueKind.String
        ? value.GetString()
        : null;
}
/// <summary>
/// Reads a string-valued property from an object nested one level inside a JSON object.
/// </summary>
/// <param name="source">JSON element expected to be an object.</param>
/// <param name="parentPropertyName">Name of the property holding the nested object.</param>
/// <param name="childPropertyName">Name of the string property inside the nested object.</param>
/// <returns>
/// The nested value, or <c>null</c> when <paramref name="source"/> or the parent property is not an
/// object, or the child property is missing or not a JSON string.
/// </returns>
private static string? ReadNestedString(
    JsonElement source,
    string parentPropertyName,
    string childPropertyName)
{
    // Property lookup is only valid on objects; TryGetProperty throws for any other element kind.
    if (source.ValueKind != JsonValueKind.Object)
    {
        return null;
    }

    if (!source.TryGetProperty(parentPropertyName, out JsonElement parent) ||
        parent.ValueKind != JsonValueKind.Object)
    {
        return null;
    }

    return ReadString(parent, childPropertyName);
}
/// <summary>
/// Normalizes a purely numeric value into an article number.
/// </summary>
/// <param name="rawValue">Candidate value; surrounding whitespace is ignored.</param>
/// <param name="articleNumber">
/// The digits, left-padded with zeros to six characters, on success; otherwise an empty string.
/// </param>
/// <returns><c>true</c> when the trimmed value consists only of the ASCII digits 0-9.</returns>
private static bool TryNormalizeArticleNumber(string? rawValue, out string articleNumber)
{
    articleNumber = string.Empty;

    if (string.IsNullOrWhiteSpace(rawValue))
    {
        return false;
    }

    string trimmed = rawValue.Trim();

    // char.IsDigit also accepts non-ASCII decimal digits (for example Arabic-Indic ones), which would
    // yield article numbers that match nothing else; only 0-9 is accepted.
    if (!trimmed.All(static character => character is >= '0' and <= '9'))
    {
        return false;
    }

    // PadLeft leaves values of six or more characters untouched.
    articleNumber = trimmed.PadLeft(6, '0');
    return true;
}
/// <summary>
/// Appends the paging query parameters to a collection URL.
/// </summary>
/// <param name="baseCollectionUrl">Collection URL, with or without an existing query string.</param>
/// <param name="page">One-based page number; values below 1 are treated as 1.</param>
/// <returns>The URL with <c>page_size</c> and <c>page</c> parameters appended.</returns>
private static string BuildPageUrl(string baseCollectionUrl, int page)
{
    int pageNumber = page < 1 ? 1 : page;

    // Continue an existing query string with '&', otherwise start one with '?'.
    bool hasQuery = baseCollectionUrl.Contains('?', StringComparison.Ordinal);
    string joiner = hasQuery ? "&" : "?";

    return $"{baseCollectionUrl}{joiner}page_size={PageSize}&page={pageNumber}";
}
/// <summary>
/// Reduces a collection URL to the part paging parameters can be appended to.
/// </summary>
/// <param name="baseCollectionUrl">Collection URL as configured; may be blank.</param>
/// <returns>
/// The trimmed URL without its query string and fragment, or an empty string for blank input.
/// </returns>
private static string NormalizeBaseCollectionUrl(string baseCollectionUrl)
{
    if (string.IsNullOrWhiteSpace(baseCollectionUrl))
    {
        return string.Empty;
    }

    string normalized = baseCollectionUrl.Trim();

    // Cut at the first '?' or '#'. A surviving fragment would swallow the paging parameters appended
    // later (everything after '#' is never sent to the server), so every request would return page one.
    int cutIndex = normalized.IndexOfAny(new[] { '?', '#' });
    return cutIndex >= 0
        ? normalized[..cutIndex]
        : normalized;
}
/// <summary>
/// Result of scanning a whole collection: everything found across the visited pages.
/// </summary>
public sealed class CollectionScanResult
{
    /// <summary>Article numbers found; compared case-insensitively by default.</summary>
    public HashSet<string> ArticleNumbers { get; init; } = new(StringComparer.OrdinalIgnoreCase);

    /// <summary>Product URLs found; compared case-insensitively by default.</summary>
    public HashSet<string> ProductUrls { get; init; } = new(StringComparer.OrdinalIgnoreCase);
}
/// <summary>
/// Items extracted from a single collection page.
/// </summary>
private sealed class CollectionPageItems
{
    /// <summary>Article numbers found on the page; compared case-insensitively by default.</summary>
    public HashSet<string> ArticleNumbers { get; init; } = new(StringComparer.OrdinalIgnoreCase);

    /// <summary>Product URLs found on the page; compared case-insensitively by default.</summary>
    public HashSet<string> ProductUrls { get; init; } = new(StringComparer.OrdinalIgnoreCase);
}
}