diff --git a/.gitattributes b/.gitattributes
index 3ece147..412baec 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -69,3 +69,4 @@ cmd/moma/testdata/artwork/4170_in.html filter=lfs diff=lfs merge=lfs -text
 cmd/moma/testdata/artwork/4170_out.json filter=lfs diff=lfs merge=lfs -text
 cmd/moma/testdata/artwork/9_in.html filter=lfs diff=lfs merge=lfs -text
 cmd/moma/testdata/artwork/9_out.json filter=lfs diff=lfs merge=lfs -text
+cmd/products/testdata/DURS_zavezanci_DEJ.txt filter=lfs diff=lfs merge=lfs -text
diff --git a/cmd/products/config.go b/cmd/products/config.go
index 3d2d927..4ec4a7c 100644
--- a/cmd/products/config.go
+++ b/cmd/products/config.go
@@ -37,4 +37,5 @@ type Config struct {
 	Natureta            Natureta            `embed:"" prefix:"natureta."`
 	LjubljanskeMlekarne LjubljanskeMlekarne `embed:"" prefix:"ljubljanskemlekarne."`
 	Zito                Zito                `embed:"" prefix:"zito."`
+	FURSDEJ             FURSDEJ             `embed:"" prefix:"fursdej."`
 }
diff --git a/cmd/products/furs.go b/cmd/products/furs.go
new file mode 100644
index 0000000..fc7aef7
--- /dev/null
+++ b/cmd/products/furs.go
@@ -0,0 +1,309 @@
+package main
+
+import (
+	"archive/zip"
+	"bufio"
+	"context"
+	"encoding/json"
+	"html"
+	"io"
+	"strings"
+
+	"github.com/hashicorp/go-retryablehttp"
+	"github.com/krolaw/zipstream"
+	"github.com/rs/zerolog"
+	"gitlab.com/tozd/go/errors"
+	"gitlab.com/tozd/go/x"
+
+	"gitlab.com/peerdb/peerdb"
+	"gitlab.com/peerdb/peerdb/document"
+	"gitlab.com/peerdb/peerdb/internal/es"
+	"gitlab.com/peerdb/peerdb/internal/indexer"
+	"gitlab.com/peerdb/peerdb/internal/types"
+	"gitlab.com/peerdb/peerdb/store"
+)
+
+const (
+	// fursDEJURL is the URL of the ZIP archive with FURS DEJ data.
+	fursDEJURL = "http://www.datoteke.fu.gov.si/DURS_zavezanci_DEJ.zip"
+	// fursDEJFileName is the name of the data file expected inside the ZIP archive.
+	fursDEJFileName = "DURS_zavezanci_DEJ.txt"
+)
+
+// Byte offsets of the fixed-width columns in the FURS DEJ data file. Each value is
+// the exclusive end of a column. One byte is skipped between adjacent columns.
+const (
+	fursVATNumberEnd          = 8
+	fursRegistrationNumberEnd = 19
+	fursSKDEnd                = 26
+	fursNameEnd               = 127
+	fursAddressEnd            = 241
+	// fursFinancialOfficeWidth is the width of the financial office code which is
+	// read from the end of the line.
+	fursFinancialOfficeWidth = 2
+)
+
+// FURSDEJ holds the configuration of the FURS DEJ data import.
+type FURSDEJ struct {
+	Disabled bool `default:"false" help:"Do not import FURS DEJ data. Default: false."`
+}
+
+// FursEntry is one record parsed from a line of the FURS DEJ data file.
+type FursEntry struct {
+	VATNumber          string `json:"idVatNo"`
+	RegistrationNumber string `json:"idRegNo"`
+	SKD                string `json:"skd"`
+	Name               string `json:"company"`
+	Address            string `json:"address"`
+	FinancialOffice    string `json:"financialOffice"`
+}
+
+// makeFursDoc converts a parsed FURS DEJ record into a document.
+//
+// The document ID and all claim IDs are derived from the registration number. The
+// SKD claim is added only when the record has a non-empty SKD value.
+func makeFursDoc(furs FursEntry) (document.D, errors.E) {
+	doc := document.D{
+		CoreDocument: document.CoreDocument{
+			ID:    document.GetID(NameSpaceProducts, "FURS", furs.RegistrationNumber),
+			Score: document.LowConfidence,
+		},
+		Claims: &document.ClaimTypes{
+			Identifier: document.IdentifierClaims{
+				{
+					CoreClaim: document.CoreClaim{
+						ID:         document.GetID(NameSpaceProducts, "FURS", furs.RegistrationNumber, "COMPANY_REGISTRATION_NUMBER", 0),
+						Confidence: document.HighConfidence,
+					},
+					Prop:  document.GetCorePropertyReference("COMPANY_REGISTRATION_NUMBER"),
+					Value: furs.RegistrationNumber,
+				},
+				{
+					CoreClaim: document.CoreClaim{
+						ID:         document.GetID(NameSpaceProducts, "FURS", furs.RegistrationNumber, "VAT_NUMBER", 0),
+						Confidence: document.HighConfidence,
+					},
+					Prop:  document.GetCorePropertyReference("VAT_NUMBER"),
+					Value: furs.VATNumber,
+				},
+			},
+			Relation: document.RelationClaims{
+				{
+					CoreClaim: document.CoreClaim{
+						// Keyed by the registration number (not the name) so that two
+						// companies sharing a name do not get the same claim ID.
+						ID:         document.GetID(NameSpaceProducts, "FURS", furs.RegistrationNumber, "TYPE", 0, "COMPANY", 0),
+						Confidence: document.HighConfidence,
+					},
+					Prop: document.GetCorePropertyReference("TYPE"),
+					To:   document.GetCorePropertyReference("COMPANY"),
+				},
+			},
+			Text: document.TextClaims{
+				{
+					CoreClaim: document.CoreClaim{
+						ID:         document.GetID(NameSpaceProducts, "FURS", furs.RegistrationNumber, "NAME", 0),
+						Confidence: document.HighConfidence,
+					},
+					Prop: document.GetCorePropertyReference("NAME"),
+					HTML: document.TranslatableHTMLString{"en": html.EscapeString(furs.Name)},
+				},
+				{
+					CoreClaim: document.CoreClaim{
+						ID:         document.GetID(NameSpaceProducts, "FURS", furs.RegistrationNumber, "ADDRESS", 0),
+						Confidence: document.HighConfidence,
+					},
+					Prop: document.GetCorePropertyReference("ADDRESS"),
+					HTML: document.TranslatableHTMLString{"en": html.EscapeString(furs.Address)},
+				},
+			},
+			String: document.StringClaims{
+				{
+					CoreClaim: document.CoreClaim{
+						ID:         document.GetID(NameSpaceProducts, "FURS", furs.RegistrationNumber, "FINANCIAL_OFFICE", 0),
+						Confidence: document.HighConfidence,
+					},
+					Prop:   document.GetCorePropertyReference("FINANCIAL_OFFICE"),
+					String: furs.FinancialOffice,
+				},
+				{
+					CoreClaim: document.CoreClaim{
+						ID:         document.GetID(NameSpaceProducts, "FURS", furs.RegistrationNumber, "COUNTRY_OF_INCORPORATION", 0),
+						Confidence: document.HighConfidence,
+					},
+					Prop:   document.GetCorePropertyReference("COUNTRY_OF_INCORPORATION"),
+					String: "Slovenia",
+				},
+			},
+		},
+	}
+
+	if skd := strings.TrimSpace(furs.SKD); skd != "" {
+		errE := doc.Add(&document.StringClaim{
+			CoreClaim: document.CoreClaim{
+				ID:         document.GetID(NameSpaceProducts, "FURS", furs.RegistrationNumber, "SKD_2025", 0),
+				Confidence: document.HighConfidence,
+			},
+			Prop:   document.GetCorePropertyReference("SKD_2025"),
+			String: skd,
+		})
+		if errE != nil {
+			return doc, errE
+		}
+	}
+
+	return doc, nil
+}
+
+// Run downloads FURS DEJ data, converts every record into a document, and stores
+// it. It does nothing when the import is disabled and returns on the first error
+// or once ctx is canceled.
+func (d FURSDEJ) Run(
+	ctx context.Context,
+	config *Config,
+	httpClient *retryablehttp.Client,
+	store *store.Store[json.RawMessage, *types.DocumentMetadata, *types.NoMetadata, *types.NoMetadata, *types.NoMetadata, document.Changes],
+	indexingCount, indexingSize *x.Counter,
+) errors.E {
+	if d.Disabled {
+		return nil
+	}
+
+	records, errE := downloadFurs(ctx, httpClient, config.Logger, config.CacheDir, fursDEJURL)
+	if errE != nil {
+		return errE
+	}
+
+	config.Logger.Info().Int("count", len(records)).Msg("retrieved FURS DEJ data")
+
+	description := "FURS DEJ processing"
+	progress := es.Progress(config.Logger, nil, nil, nil, description)
+	indexingSize.Add(int64(len(records)))
+
+	count := x.Counter(0)
+	ticker := x.NewTicker(ctx, &count, x.NewCounter(int64(len(records))), indexer.ProgressPrintRate)
+	defer ticker.Stop()
+	go func() {
+		for p := range ticker.C {
+			progress(ctx, p)
+		}
+	}()
+
+	for i, record := range records {
+		if err := ctx.Err(); err != nil { // Check if context is canceled.
+			return errors.WithStack(err)
+		}
+		config.Logger.Debug().
+			Int("index", i).
+			Str("id", record.RegistrationNumber).
+			Msg("processing company record")
+
+		doc, errE := makeFursDoc(record)
+		if errE != nil {
+			errors.Details(errE)["id"] = record.RegistrationNumber
+			return errE
+		}
+
+		count.Increment()
+		indexingCount.Increment()
+
+		config.Logger.Debug().Str("doc", doc.ID.String()).Msg("saving document")
+		errE = peerdb.InsertOrReplaceDocument(ctx, store, &doc)
+		if errE != nil {
+			errors.Details(errE)["id"] = record.RegistrationNumber
+			return errE
+		}
+	}
+	config.Logger.Info().
+		Int64("count", count.Count()).
+		Int("total", len(records)).
+		Msg(description + " done")
+
+	return nil
+}
+
+// downloadFurs obtains the ZIP archive at url using indexer.CachedDownload and
+// returns the records parsed from the FURS DEJ data file inside it. It returns an
+// error if the archive does not contain that file.
+func downloadFurs(ctx context.Context, httpClient *retryablehttp.Client, logger zerolog.Logger, cacheDir, url string) ([]FursEntry, errors.E) {
+	reader, _, errE := indexer.CachedDownload(ctx, httpClient, logger, cacheDir, url)
+	if errE != nil {
+		return nil, errE
+	}
+	defer reader.Close()
+
+	zipReader := zipstream.NewReader(reader)
+	var file *zip.FileHeader
+	var err error
+	for file, err = zipReader.Next(); err == nil; file, err = zipReader.Next() {
+		if file.Name == fursDEJFileName {
+			return processFursDejFile(zipReader)
+		}
+	}
+
+	if errors.Is(err, io.EOF) {
+		return nil, errors.New(`"` + fursDEJFileName + `" not found in ZIP`)
+	}
+
+	return nil, errors.WithStack(err)
+}
+
+// trimAndExtract returns the bytes of line in the range [start, end) with
+// surrounding whitespace removed.
+//
+// The range is clamped to the line: a line which ends inside the column yields the
+// part of the column which is present, and a line which ends before the column
+// starts yields an empty string.
+func trimAndExtract(line string, start, end int) string {
+	if start < 0 {
+		start = 0
+	}
+	if end > len(line) {
+		end = len(line)
+	}
+	if start >= end {
+		return ""
+	}
+	return strings.TrimSpace(line[start:end])
+}
+
+// processFursDejFile parses FURS DEJ fixed-width data from reader, one record per
+// line. Blank lines are skipped. It returns an error only if reading fails.
+//
+// NOTE(review): columns are sliced by byte offset, which presumably relies on the
+// file using a single-byte encoding - confirm against the real data.
+func processFursDejFile(reader io.Reader) ([]FursEntry, errors.E) {
+	scanner := bufio.NewScanner(reader)
+	var records []FursEntry
+
+	for scanner.Scan() {
+		line := scanner.Text()
+		if strings.TrimSpace(line) == "" {
+			// Nothing to parse, e.g., a trailing empty line.
+			continue
+		}
+
+		// The financial office code is taken from the end of the line. A line shorter
+		// than the code is used whole instead of causing an out-of-range panic.
+		officeStart := len(line) - fursFinancialOfficeWidth
+		if officeStart < 0 {
+			officeStart = 0
+		}
+
+		// Extract fields based on fixed positions.
+		records = append(records, FursEntry{
+			VATNumber:          trimAndExtract(line, 0, fursVATNumberEnd),
+			RegistrationNumber: trimAndExtract(line, fursVATNumberEnd+1, fursRegistrationNumberEnd),
+			SKD:                trimAndExtract(line, fursRegistrationNumberEnd+1, fursSKDEnd),
+			Name:               trimAndExtract(line, fursSKDEnd+1, fursNameEnd),
+			Address:            trimAndExtract(line, fursNameEnd+1, fursAddressEnd),
+			FinancialOffice:    line[officeStart:],
+		})
+	}
+
+	if err := scanner.Err(); err != nil {
+		return nil, errors.WithStack(err)
+	}
+
+	return records, nil
+}
diff --git a/cmd/products/furs_test.go b/cmd/products/furs_test.go
new file mode 100644
index 0000000..03c8ce2
--- /dev/null
+++ b/cmd/products/furs_test.go
@@ -0,0 +1,62 @@
+package main
+
+import (
+	"embed"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+//go:embed testdata
+var content embed.FS
+
+func TestProcessFursDejFile(t *testing.T) {
+	t.Parallel()
+
+	file, err := content.Open("testdata/DURS_zavezanci_DEJ.txt")
+	require.NoError(t, err)
+	t.Cleanup(func() { file.Close() })
+
+	records, errE := processFursDejFile(file)
+	require.NoError(t, errE, "% -+#.1v", errE)
+	require.Len(t, records, 8)
+
+	// Record 5 is the problematic one: its SKD has to be parsed as an empty string.
+	assert.Empty(t, records[5].SKD, "SKD mismatch")
+
+	for i, record := range records {
+		assert.Len(t, record.VATNumber, 8, "VATNumber should be 8 characters long")
+		assert.Len(t, record.RegistrationNumber, 10, "RegistrationNumber should be 10 characters long")
+		assert.Len(t, record.FinancialOffice, 2, "FinancialOffice should be 2 characters long")
+		assert.NotEmpty(t, record.Name, "Name should not be empty")
+		assert.NotEmpty(t, record.Address, "Address should not be empty")
+		// The SKD length check does not apply to records 5 and 6.
+		if i == 5 || i == 6 {
+			continue
+		}
+		assert.Len(t, record.SKD, 6, "SKD should be 'XX.XXX' 6 characters long")
+	}
+}
+
+func TestProcessFursDejFileMalformedLines(t *testing.T) {
+	t.Parallel()
+
+	// Blank lines are skipped and a line shorter than any column must not panic.
+	records, errE := processFursDejFile(strings.NewReader("\n   \n1\n"))
+	require.NoError(t, errE, "% -+#.1v", errE)
+	require.Len(t, records, 1)
+	assert.Equal(t, "1", records[0].VATNumber)
+	assert.Equal(t, "1", records[0].FinancialOffice)
+	assert.Empty(t, records[0].RegistrationNumber)
+}
+
+func TestTrimAndExtract(t *testing.T) {
+	t.Parallel()
+
+	assert.Equal(t, "abc", trimAndExtract("  abc  ", 0, 7))
+	assert.Equal(t, "bc", trimAndExtract("abc", 1, 10))
+	assert.Empty(t, trimAndExtract("abc", 5, 10))
+	assert.Empty(t, trimAndExtract("abc", 2, 1))
+}
diff --git a/cmd/products/index.go b/cmd/products/index.go
index a56811f..90af79c 100644
--- a/cmd/products/index.go
+++ b/cmd/products/index.go
@@ -58,6 +58,10 @@ func index(config *Config) errors.E {
 		return config.Zito.Run(ctx, config, httpClient, store, indexingCount, indexingSize)
 	})
 
+	g.Go(func() error {
+		return config.FURSDEJ.Run(ctx, config, httpClient, store, indexingCount, indexingSize)
+	})
+
 	errE = errors.WithStack(g.Wait())
 	if errE != nil {
 		return errE
diff --git a/cmd/products/properties.go b/cmd/products/properties.go
index b741216..fb86a46 100644
--- a/cmd/products/properties.go
+++ b/cmd/products/properties.go
@@ -131,6 +131,50 @@ var productsProperties = []struct {
 		`A description of packaging of a branded food product.`,
 		[]string{`"text" claim type`},
 	},
+
+	// FursEntry specific properties start here.
+	{
+		"VAT number",
+		nil,
+		`A company VAT number.`,
+		[]string{`"identifier" claim type`},
+	},
+	{
+		"Company registration number",
+		nil,
+		`A company registration number.`,
+		[]string{`"identifier" claim type`},
+	},
+	{
+		"SKD 2025",
+		[]string{"Standard Classification of Activities 2025"},
+		`National Standard Classification of Activities in Slovenia extending NACE Rev. 2.1.`,
+		[]string{`"string" claim type`},
+	},
+	{
+		"company",
+		nil,
+		"A document is about a company.",
+		[]string{`item`},
+	},
+	{
+		"address",
+		nil,
+		`An address.`,
+		[]string{`"text" claim type`},
+	},
+	{
+		"financial office",
+		nil,
+		`A financial office responsible for the company.`,
+		[]string{`"string" claim type`},
+	},
+	{
+		"country of incorporation",
+		nil,
+		`Country of incorporation.`,
+		[]string{`"string" claim type`},
+	},
 }
 
 func init() { //nolint:gochecknoinits
diff --git a/cmd/products/testdata/DURS_zavezanci_DEJ.txt b/cmd/products/testdata/DURS_zavezanci_DEJ.txt
new file mode 100644
index 0000000..c4e6e3a
--- /dev/null
+++ b/cmd/products/testdata/DURS_zavezanci_DEJ.txt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:641541a633a2148a737c9f11eae556961db0f0218d8edd568fade913558a147c
+size 1971