From dc69d2a84800e039275519491edc29387c9df50b Mon Sep 17 00:00:00 2001 From: Stefano Peluchetti Date: Sun, 13 Feb 2022 05:37:07 +0000 Subject: [PATCH 1/5] Add support for author. --- src/assets/index.html | 1 + src/parser/atom.go | 3 +++ src/parser/json.go | 13 +++++++++++++ src/parser/models.go | 9 +++++---- src/parser/rdf.go | 8 ++++++++ src/parser/rss.go | 9 +++++++++ src/storage/item.go | 12 ++++++++---- src/worker/crawler.go | 1 + 8 files changed, 48 insertions(+), 8 deletions(-) diff --git a/src/assets/index.html b/src/assets/index.html index 9cdc8443..3213d025 100644 --- a/src/assets/index.html +++ b/src/assets/index.html @@ -343,6 +343,7 @@

{{ itemSelectedDetails.title || 'untitled' }}

{{ feedsById[itemSelectedDetails.feed_id].title }}
+

diff --git a/src/parser/atom.go b/src/parser/atom.go index b7720766..dd6a396d 100644 --- a/src/parser/atom.go +++ b/src/parser/atom.go @@ -29,6 +29,8 @@ type atomEntry struct { OrigLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origLink"` media + + Author string `xml:"author>name"` } type atomText struct { @@ -90,6 +92,7 @@ func ParseAtom(r io.Reader) (*Feed, error) { Content: firstNonEmpty(srcitem.Content.String(), srcitem.Summary.String(), srcitem.firstMediaDescription()), ImageURL: srcitem.firstMediaThumbnail(), AudioURL: "", + Author: srcitem.Author, }) } return dstfeed, nil diff --git a/src/parser/json.go b/src/parser/json.go index 3c8a4ada..b81c4092 100644 --- a/src/parser/json.go +++ b/src/parser/json.go @@ -4,6 +4,7 @@ package parser import ( "encoding/json" "io" + "strings" ) type jsonFeed struct { @@ -23,6 +24,7 @@ type jsonItem struct { DatePublished string `json:"date_published"` DateModified string `json:"date_modified"` Attachments []jsonAttachment `json:"attachments"` + Authors []jsonAuthor `json:"authors"` } type jsonAttachment struct { @@ -33,6 +35,12 @@ type jsonAttachment struct { Duration int `json:"duration_in_seconds"` } +type jsonAuthor struct { + Name string `json:"name"` + URL string `json:"url"` + Avatar string `json:"avatar"` +} + func ParseJSON(data io.Reader) (*Feed, error) { srcfeed := new(jsonFeed) decoder := json.NewDecoder(data) @@ -45,12 +53,17 @@ func ParseJSON(data io.Reader) (*Feed, error) { SiteURL: srcfeed.SiteURL, } for _, srcitem := range srcfeed.Items { + authors := []string{} + for _, v := range srcitem.Authors { + authors = append(authors, v.Name) + } dstfeed.Items = append(dstfeed.Items, Item{ GUID: firstNonEmpty(srcitem.ID, srcitem.URL), Date: dateParse(firstNonEmpty(srcitem.DatePublished, srcitem.DateModified)), URL: srcitem.URL, Title: srcitem.Title, Content: firstNonEmpty(srcitem.HTML, srcitem.Text, srcitem.Summary), + Author: strings.Join(authors, ","), }) } return dstfeed, nil diff --git a/src/parser/models.go b/src/parser/models.go index 7587b778..3aeeec00 100644 --- a/src/parser/models.go +++ b/src/parser/models.go @@ -9,10 +9,11 @@ type Feed struct { } type Item struct { - GUID string - Date time.Time - URL string - Title string + GUID string + Date time.Time + URL string + Title string + Author string Content string ImageURL string diff --git a/src/parser/rdf.go b/src/parser/rdf.go index 6fce6052..21540830 100644 --- a/src/parser/rdf.go +++ b/src/parser/rdf.go @@ -22,6 +22,9 @@ type rdfItem struct { DublinCoreDate string `xml:"http://purl.org/dc/elements/1.1/ date"` ContentEncoded string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"` + + DublinCoreCreator string `xml:"http://purl.org/dc/elements/1.1/ creator"` + Author string `xml:"author"` } func ParseRDF(r io.Reader) (*Feed, error) { @@ -37,12 +40,17 @@ func ParseRDF(r io.Reader) (*Feed, error) { SiteURL: srcfeed.Link, } for _, srcitem := range srcfeed.Items { + author := srcitem.DublinCoreCreator + if len(author) == 0 { + author = srcitem.Author + } dstfeed.Items = append(dstfeed.Items, Item{ GUID: srcitem.Link, URL: srcitem.Link, Date: dateParse(srcitem.DublinCoreDate), Title: srcitem.Title, Content: firstNonEmpty(srcitem.ContentEncoded, srcitem.Description), + Author: author, }) } return dstfeed, nil diff --git a/src/parser/rss.go b/src/parser/rss.go index 9eedfc1e..970f96d6 100644 --- a/src/parser/rss.go +++ b/src/parser/rss.go @@ -34,6 +34,9 @@ type rssItem struct { OrigEnclosureLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origEnclosureLink"` media + + DublinCoreCreator string `xml:"http://purl.org/dc/elements/1.1/ creator"` + Author string `xml:"author"` } type rssLink struct { @@ -81,6 +84,11 @@ func ParseRSS(r io.Reader) (*Feed, error) { } } + author := srcitem.DublinCoreCreator + if len(author) == 0 { + author = srcitem.Author + } + dstfeed.Items = append(dstfeed.Items, Item{ GUID: firstNonEmpty(srcitem.GUID, srcitem.Link), Date: dateParse(firstNonEmpty(srcitem.DublinCoreDate, srcitem.PubDate)), @@ -89,6 +97,7 @@ func ParseRSS(r io.Reader) (*Feed, error) { Content: firstNonEmpty(srcitem.ContentEncoded, srcitem.Description), AudioURL: podcastURL, ImageURL: srcitem.firstMediaThumbnail(), + Author: author, }) } return dstfeed, nil diff --git a/src/storage/item.go b/src/storage/item.go index de44dfc3..0fa1ba4c 100644 --- a/src/storage/item.go +++ b/src/storage/item.go @@ -54,6 +54,7 @@ type Item struct { Status ItemStatus `json:"status"` ImageURL *string `json:"image"` AudioURL *string `json:"podcast_url"` + Author string `json:"author"` } type ItemFilter struct { @@ -83,13 +84,14 @@ func (s *Storage) CreateItems(items []Item) bool { insert into items ( guid, feed_id, title, link, date, content, image, podcast_url, - date_arrived, status + date_arrived, status, + author ) - values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) on conflict (feed_id, guid) do nothing`, item.GUID, item.FeedId, item.Title, item.Link, item.Date, item.Content, item.ImageURL, item.AudioURL, - now, UNREAD, + now, UNREAD, item.Author, ) if err != nil { log.Print(err) @@ -194,12 +196,14 @@ func (s *Storage) GetItem(id int64) *Item { err := s.db.QueryRow(` select i.id, i.guid, i.feed_id, i.title, i.link, i.content, - i.date, i.status, i.image, i.podcast_url + i.date, i.status, i.image, i.podcast_url, + i.author from items i where i.id = ? `, id).Scan( &i.Id, &i.GUID, &i.FeedId, &i.Title, &i.Link, &i.Content, &i.Date, &i.Status, &i.ImageURL, &i.AudioURL, + &i.Author, ) if err != nil { log.Print(err) diff --git a/src/worker/crawler.go b/src/worker/crawler.go index 3720a389..18b85148 100644 --- a/src/worker/crawler.go +++ b/src/worker/crawler.go @@ -161,6 +161,7 @@ func ConvertItems(items []parser.Item, feed storage.Feed) []storage.Item { Status: storage.UNREAD, ImageURL: imageURL, AudioURL: audioURL, + Author: item.Author, } } return result From db476cc82a54f4717b9eb75820b1dc715c2edf37 Mon Sep 17 00:00:00 2001 From: Stefano Peluchetti Date: Sun, 20 Feb 2022 05:15:22 +0000 Subject: [PATCH 2/5] Add author search. --- src/storage/item.go | 8 ++++---- src/storage/migration.go | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/storage/item.go b/src/storage/item.go index 0fa1ba4c..5afc6095 100644 --- a/src/storage/item.go +++ b/src/storage/item.go @@ -260,7 +260,7 @@ func (s *Storage) FeedStats() []FeedStat { func (s *Storage) SyncSearch() { rows, err := s.db.Query(` - select id, title, content + select id, title, content, author from items where search_rowid is null; `) @@ -272,14 +272,14 @@ func (s *Storage) SyncSearch() { items := make([]Item, 0) for rows.Next() { var item Item - rows.Scan(&item.Id, &item.Title, &item.Content) + rows.Scan(&item.Id, &item.Title, &item.Content, &item.Author) items = append(items, item) } for _, item := range items { result, err := s.db.Exec(` - insert into search (title, description, content) values (?, "", ?)`, - item.Title, htmlutil.ExtractText(item.Content), + insert into search (title, description, content, author) values (?, "", ?, ?)`, + item.Title, htmlutil.ExtractText(item.Content), item.Author, ) if err != nil { log.Print(err) diff --git a/src/storage/migration.go b/src/storage/migration.go index ab12de1c..a6c8e9c3 100644 --- a/src/storage/migration.go +++ b/src/storage/migration.go @@ -131,7 +131,7 @@ func m01_initial(tx *sql.Tx) error { val blob ); - create virtual table if not exists search using fts4(title, description, content); + create virtual table if not exists search using fts4(title, description, content, author); create trigger if not exists del_item_search after delete on items begin delete from search where rowid = old.search_rowid; From a77bdf93d9f4f9af38c54cee6bb310647e6f4aeb Mon Sep 17 00:00:00 2001 From: Stefano Peluchetti Date: Sun, 20 Feb 2022 05:15:41 +0000 Subject: [PATCH 3/5] Strip HTML from author. --- src/worker/crawler.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/worker/crawler.go b/src/worker/crawler.go index 18b85148..395e91d7 100644 --- a/src/worker/crawler.go +++ b/src/worker/crawler.go @@ -11,6 +11,7 @@ import ( "net/url" "strings" + "github.com/nkanaev/yarr/src/content/htmlutil" "github.com/nkanaev/yarr/src/content/scraper" "github.com/nkanaev/yarr/src/parser" "github.com/nkanaev/yarr/src/storage" @@ -161,7 +162,7 @@ func ConvertItems(items []parser.Item, feed storage.Feed) []storage.Item { Status: storage.UNREAD, ImageURL: imageURL, AudioURL: audioURL, - Author: item.Author, + Author: htmlutil.ExtractText(item.Author), } } return result From 2bfe12e0577ab32f066f48bccd2c536a8e99871d Mon Sep 17 00:00:00 2001 From: Stefano Peluchetti Date: Sun, 20 Feb 2022 05:32:33 +0000 Subject: [PATCH 4/5] Fix JSON author concat. --- src/parser/json.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parser/json.go b/src/parser/json.go index b81c4092..de3b28b5 100644 --- a/src/parser/json.go +++ b/src/parser/json.go @@ -63,7 +63,7 @@ func ParseJSON(data io.Reader) (*Feed, error) { URL: srcitem.URL, Title: srcitem.Title, Content: firstNonEmpty(srcitem.HTML, srcitem.Text, srcitem.Summary), - Author: strings.Join(authors, ","), + Author: strings.Join(authors, ", "), }) } return dstfeed, nil From 2f06fb987d715f7207bb1322748c07ad73dac58f Mon Sep 17 00:00:00 2001 From: Stefano Peluchetti Date: Sun, 20 Feb 2022 05:37:44 +0000 Subject: [PATCH 5/5] Add tests. --- src/parser/atom_test.go | 1 + src/parser/json_test.go | 16 ++++++++++++++-- src/parser/rdf_test.go | 3 ++- src/parser/rss_test.go | 2 ++ 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/parser/atom_test.go b/src/parser/atom_test.go index fa01297b..c556b383 100644 --- a/src/parser/atom_test.go +++ b/src/parser/atom_test.go @@ -47,6 +47,7 @@ func TestAtom(t *testing.T) { Content: `

This is the entry content.

`, ImageURL: "", AudioURL: "", + Author: "John Doe", }, }, } diff --git a/src/parser/json_test.go b/src/parser/json_test.go index f94fd1b8..60c376f9 100644 --- a/src/parser/json_test.go +++ b/src/parser/json_test.go @@ -16,7 +16,19 @@ func TestJSONFeed(t *testing.T) { { "id": "2", "content_text": "This is a second item.", - "url": "https://example.org/second-item" + "url": "https://example.org/second-item", + "authors": [ + { + "name": "Optional Author 1", + "url": "https://example.org/authors/optional-author", + "avatar": "https://example.org/authors/optional-author/avatar-512x512.png" + }, + { + "name": "Optional Author 2", + "url": "https://example.org/authors/optional-author-2", + "avatar": "https://example.org/authors/optional-author/avatar-512x512.png" + } + ] }, { "id": "1", @@ -29,7 +41,7 @@ func TestJSONFeed(t *testing.T) { Title: "My Example Feed", SiteURL: "https://example.org/", Items: []Item{ - {GUID: "2", Content: "This is a second item.", URL: "https://example.org/second-item"}, + {GUID: "2", Content: "This is a second item.", URL: "https://example.org/second-item", Author: "Optional Author 1, Optional Author 2"}, {GUID: "1", Content: "

Hello, world!

", URL: "https://example.org/initial-post"}, }, } diff --git a/src/parser/rdf_test.go b/src/parser/rdf_test.go index 0a99c251..5e941484 100644 --- a/src/parser/rdf_test.go +++ b/src/parser/rdf_test.go @@ -29,6 +29,7 @@ func TestRDFFeed(t *testing.T) { New Status Updates http://www.mozilla.org/status/ + Doe John, Stelvio Runner @@ -42,7 +43,7 @@ func TestRDFFeed(t *testing.T) { Title: "Mozilla Dot Org", SiteURL: "http://www.mozilla.org", Items: []Item{ - {GUID: "http://www.mozilla.org/status/", URL: "http://www.mozilla.org/status/", Title: "New Status Updates"}, + {GUID: "http://www.mozilla.org/status/", URL: "http://www.mozilla.org/status/", Title: "New Status Updates", Author: "Doe John, Stelvio Runner"}, {GUID: "http://www.mozilla.org/bugs/", URL: "http://www.mozilla.org/bugs/", Title: "Bugzilla Reorganized"}, }, } diff --git a/src/parser/rss_test.go b/src/parser/rss_test.go index e6eb8a10..894f5530 100644 --- a/src/parser/rss_test.go +++ b/src/parser/rss_test.go @@ -20,6 +20,7 @@ func TestRSSFeed(t *testing.T) { Title 1 http://www.scripting.com/one/ Description 1 + Doe John, Stelvio Runner Title 2 @@ -38,6 +39,7 @@ func TestRSSFeed(t *testing.T) { URL: "http://www.scripting.com/one/", Title: "Title 1", Content: "Description 1", + Author: "Doe John, Stelvio Runner", }, { GUID: "http://www.scripting.com/two/",