Skip to content

Commit

Permalink
When extracting icons, instead of looking for the largest icon for ea…
Browse files Browse the repository at this point in the history
…ch value of `rel=` in `<link>` tags, pool them together first, and then look for the single largest icon across all of them.
  • Loading branch information
chimbori committed Apr 19, 2022
1 parent 4b99808 commit b848e0a
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 11 deletions.
22 changes: 13 additions & 9 deletions src/main/kotlin/com/chimbori/crux/extractors/MetadataHelpers.kt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import java.util.Locale
import okhttp3.HttpUrl
import okhttp3.HttpUrl.Companion.toHttpUrlOrNull
import org.jsoup.nodes.Document
import org.jsoup.select.Elements
import org.jsoup.nodes.Element

internal fun Document.extractTitle(): String? = (
title().nullIfBlank()
Expand Down Expand Up @@ -45,13 +45,17 @@ internal fun Document.extractKeywords(): List<String> =
.split("\\s*,\\s*".toRegex())
.filter { it.isNotBlank() }

/**
 * Per-`rel` favicon lookup. On this commit page this appears to be the removed ("before") side of
 * the diff; it is kept verbatim.
 *
 * The selectors are tried in the order written below. [findLargestIcon] is applied to each
 * selector's matches separately, and the first selector that yields a non-null `href` wins, even
 * if a later selector would have matched a larger icon.
 *
 * The chosen `href` is resolved against [baseUrl] when that succeeds; otherwise it is parsed on its
 * own with `toHttpUrlOrNull()`. If no usable icon URL results, the fallback is `/favicon.ico` on
 * [baseUrl].
 *
 * @param baseUrl URL used to resolve the icon `href` and to build the fallback; may be null.
 * @return the icon URL, or null when no usable icon URL is found and [baseUrl] is null.
 */
internal fun Document.extractFaviconUrl(baseUrl: HttpUrl?): HttpUrl? = (
findLargestIcon(select("link[rel~=icon]"))
?: findLargestIcon(select("link[rel~=ICON]"))
?: findLargestIcon(select("link[rel~=apple-touch-icon]"))
?: findLargestIcon(select("link[rel~=apple-touch-icon-precomposed]"))
)?.let { baseUrl?.resolve(it) ?: it.toHttpUrlOrNull() }
?: baseUrl?.newBuilder()?.encodedPath("/favicon.ico")?.build()
/**
 * Finds the best favicon URL for this document.
 *
 * All `<link>` elements whose `rel` looks like an icon are pooled into one list, and
 * [findLargestIcon] then picks a single winner across the whole pool (rather than per `rel` value).
 *
 * The chosen `href` is resolved against [baseUrl] when that succeeds; otherwise it is parsed on its
 * own with `toHttpUrlOrNull()`. If no usable icon URL results, the fallback is `/favicon.ico` on
 * [baseUrl].
 *
 * @param baseUrl URL used to resolve the icon `href` and to build the fallback; may be null.
 * @return the icon URL, or null when no usable icon URL is found and [baseUrl] is null.
 */
internal fun Document.extractFaviconUrl(baseUrl: HttpUrl?): HttpUrl? {
  // jsoup's `[attr~=regex]` selector is a regular-expression match, so the inline `(?i)` flag makes
  // the generic selector case-insensitive. That covers `icon` and `ICON` as before, plus mixed-case
  // values such as `Shortcut Icon` that the two separate selectors missed.
  //
  // The apple-touch selectors stay first, as in the previous version: the pooled order decides
  // which element wins when several report the same size.
  val allPossibleIconElements = listOf(
    select("link[rel~=apple-touch-icon]"),
    select("link[rel~=apple-touch-icon-precomposed]"),
    select("link[rel~=(?i)icon]"),
  )
  return findLargestIcon(allPossibleIconElements.flatten())
    ?.let { baseUrl?.resolve(it) ?: it.toHttpUrlOrNull() }
    ?: baseUrl?.newBuilder()?.encodedPath("/favicon.ico")?.build()
}

internal fun Document.extractImageUrl(baseUrl: HttpUrl?): HttpUrl? = (
// Twitter Cards and Open Graph images are usually higher quality, so rank them first.
Expand All @@ -75,7 +79,7 @@ internal fun Document.extractVideoUrl(baseUrl: HttpUrl?): HttpUrl? =
select("meta[property=og:video]").attr("content").nullIfBlank()
?.let { baseUrl?.resolve(it) ?: it.toHttpUrlOrNull() }

internal fun findLargestIcon(iconElements: Elements): String? =
/**
 * Returns the `href` of the element in [iconElements] whose `sizes` attribute parses to the
 * largest value, considering only elements that carry a non-blank `href`.
 *
 * Elements without a usable `href` are dropped before the comparison. Otherwise a large icon with
 * a blank or missing `href` would win and force a null result even though smaller, valid icons
 * were available.
 *
 * @param iconElements candidate elements, typically `<link>` tags; may be empty.
 * @return the winning `href`, or null when no element has a non-blank `href`.
 */
internal fun findLargestIcon(iconElements: List<Element>): String? =
  iconElements
    .filter { it.attr("href").isNotBlank() }
    .maxByOrNull { parseSize(it.attr("sizes")) }
    ?.attr("href")
    // Kept so the returned value goes through the same helper as before.
    ?.nullIfBlank()

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,25 @@ class MetadataHelpersTest {
|<link rel="icon" sizes="72x72" href="/72.png">
|<link rel="icon" sizes="114x114" href="/114.png">
|<link rel="icon" sizes="144x144" href="/144.png">
|<link rel="icon" href="/no-size.png">
|<link rel="icon" href="/no-size.png">
""".trimMargin(), "https://example.org/"
).select("link[rel~=icon]")
).select("*")
)
)

assertEquals(
"/512.png",
findLargestIcon(
Jsoup.parse(
"""
|<link rel="apple-touch-icon-precomposed" sizes="512x512" href="/512.png">
|<link rel="apple-touch-icon" sizes="57x57" href="/57.png">
|<link rel="icon" sizes="72x72" href="/72.png">
|<link rel="icon" sizes="114x114" href="/114.png">
|<link rel="apple-touch-icon" sizes="144x144" href="/144.png">
""".trimMargin(), "https://example.org/"
).select("*")
)
)
}
Expand Down

0 comments on commit b848e0a

Please sign in to comment.