Skip to content

Commit

Permalink
Merge branch 'development-t' into 'main'
Browse files Browse the repository at this point in the history
Added furs data

See merge request peerdb/peerdb!4
  • Loading branch information
mitar committed Feb 19, 2025
2 parents cb52399 + 9824d2d commit 4f98c31
Show file tree
Hide file tree
Showing 7 changed files with 373 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,4 @@ cmd/moma/testdata/artwork/4170_in.html filter=lfs diff=lfs merge=lfs -text
cmd/moma/testdata/artwork/4170_out.json filter=lfs diff=lfs merge=lfs -text
cmd/moma/testdata/artwork/9_in.html filter=lfs diff=lfs merge=lfs -text
cmd/moma/testdata/artwork/9_out.json filter=lfs diff=lfs merge=lfs -text
cmd/products/testdata/DURS_zavezanci_DEJ.txt filter=lfs diff=lfs merge=lfs -text
1 change: 1 addition & 0 deletions cmd/products/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,5 @@ type Config struct {
Natureta Natureta `embed:"" prefix:"natureta."`
LjubljanskeMlekarne LjubljanskeMlekarne `embed:"" prefix:"ljubljanskemlekarne."`
Zito Zito `embed:"" prefix:"zito."`
FURSDEJ FURSDEJ `embed:"" prefix:"fursdej."`
}
272 changes: 272 additions & 0 deletions cmd/products/furs.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,272 @@
package main

import (
"archive/zip"
"bufio"
"context"
"encoding/json"
"html"
"io"
"strings"

"github.com/hashicorp/go-retryablehttp"
"github.com/krolaw/zipstream"
"github.com/rs/zerolog"
"gitlab.com/tozd/go/errors"
"gitlab.com/tozd/go/x"

"gitlab.com/peerdb/peerdb"
"gitlab.com/peerdb/peerdb/document"
"gitlab.com/peerdb/peerdb/internal/es"
"gitlab.com/peerdb/peerdb/internal/indexer"
"gitlab.com/peerdb/peerdb/internal/types"
"gitlab.com/peerdb/peerdb/store"
)

const (
// fursDEJURL is the address of the ZIP archive that Run passes to downloadFurs
// to obtain the FURS DEJ data.
// NOTE(review): the archive is fetched over plain HTTP — confirm whether the
// server also offers HTTPS.
fursDEJURL = "http://www.datoteke.fu.gov.si/DURS_zavezanci_DEJ.zip"
)

// FURSDEJ holds the configuration of the FURS DEJ import and implements the
// import itself through its Run method.
type FURSDEJ struct {
// Disabled turns the import off: Run returns immediately without doing
// anything when it is set.
Disabled bool `default:"false" help:"Do not import FURS DEJ data. Default: false."`
}

// FursEntry is one record parsed by processFursDejFile from a single
// fixed-width line of the DURS_zavezanci_DEJ.txt file. makeFursDoc converts
// it into a document.
type FursEntry struct {
// VATNumber becomes the VAT_NUMBER identifier claim.
VATNumber string `json:"idVatNo"`
// RegistrationNumber becomes the COMPANY_REGISTRATION_NUMBER identifier
// claim and is also used to derive the document ID.
RegistrationNumber string `json:"idRegNo"`
// SKD becomes the SKD_2025 string claim; it may be empty, in which case
// no such claim is added.
SKD string `json:"skd"`
// Name becomes the NAME text claim.
Name string `json:"company"`
// Address becomes the ADDRESS text claim.
Address string `json:"address"`
// FinancialOffice becomes the FINANCIAL_OFFICE string claim.
FinancialOffice string `json:"financialOffice"`
}

// makeFursDoc converts a parsed FURS DEJ record into a document.
//
// The document ID and every claim ID are derived from the record's
// registration number. The document always carries these claims:
//
//   - identifier claims COMPANY_REGISTRATION_NUMBER and VAT_NUMBER,
//   - a TYPE relation claim pointing to COMPANY,
//   - text claims NAME and ADDRESS (HTML-escaped),
//   - string claims FINANCIAL_OFFICE and COUNTRY_OF_INCORPORATION.
//
// An SKD_2025 string claim is added only when the record has a non-blank SKD.
// An error is returned only when adding that optional claim fails.
func makeFursDoc(furs FursEntry) (document.D, errors.E) {
	doc := document.D{
		CoreDocument: document.CoreDocument{
			ID:    document.GetID(NameSpaceProducts, "FURS", furs.RegistrationNumber),
			Score: document.LowConfidence,
		},
		Claims: &document.ClaimTypes{
			Identifier: document.IdentifierClaims{
				{
					CoreClaim: document.CoreClaim{
						ID:         document.GetID(NameSpaceProducts, "FURS", furs.RegistrationNumber, "COMPANY_REGISTRATION_NUMBER", 0),
						Confidence: document.HighConfidence,
					},
					Prop:  document.GetCorePropertyReference("COMPANY_REGISTRATION_NUMBER"),
					Value: furs.RegistrationNumber,
				},
				{
					CoreClaim: document.CoreClaim{
						ID:         document.GetID(NameSpaceProducts, "FURS", furs.RegistrationNumber, "VAT_NUMBER", 0),
						Confidence: document.HighConfidence,
					},
					Prop:  document.GetCorePropertyReference("VAT_NUMBER"),
					Value: furs.VATNumber,
				},
			},
			Relation: document.RelationClaims{
				{
					CoreClaim: document.CoreClaim{
						// The claim ID is derived from the registration number, like the
						// document ID and every other claim here, so that two companies
						// sharing a name never share a claim ID. The trailing segments
						// name the relation target (COMPANY).
						ID:         document.GetID(NameSpaceProducts, "FURS", furs.RegistrationNumber, "TYPE", 0, "COMPANY", 0),
						Confidence: document.HighConfidence,
					},
					Prop: document.GetCorePropertyReference("TYPE"),
					To:   document.GetCorePropertyReference("COMPANY"),
				},
			},
			Text: document.TextClaims{
				{
					CoreClaim: document.CoreClaim{
						ID:         document.GetID(NameSpaceProducts, "FURS", furs.RegistrationNumber, "NAME", 0),
						Confidence: document.HighConfidence,
					},
					Prop: document.GetCorePropertyReference("NAME"),
					HTML: document.TranslatableHTMLString{"en": html.EscapeString(furs.Name)},
				},
				{
					CoreClaim: document.CoreClaim{
						ID:         document.GetID(NameSpaceProducts, "FURS", furs.RegistrationNumber, "ADDRESS", 0),
						Confidence: document.HighConfidence,
					},
					Prop: document.GetCorePropertyReference("ADDRESS"),
					HTML: document.TranslatableHTMLString{"en": html.EscapeString(furs.Address)},
				},
			},
			String: document.StringClaims{
				{
					CoreClaim: document.CoreClaim{
						ID:         document.GetID(NameSpaceProducts, "FURS", furs.RegistrationNumber, "FINANCIAL_OFFICE", 0),
						Confidence: document.HighConfidence,
					},
					Prop:   document.GetCorePropertyReference("FINANCIAL_OFFICE"),
					String: furs.FinancialOffice,
				},
				{
					CoreClaim: document.CoreClaim{
						ID:         document.GetID(NameSpaceProducts, "FURS", furs.RegistrationNumber, "COUNTRY_OF_INCORPORATION", 0),
						Confidence: document.HighConfidence,
					},
					Prop:   document.GetCorePropertyReference("COUNTRY_OF_INCORPORATION"),
					String: "Slovenia",
				},
			},
		},
	}

	// SKD is optional in the source data, so the claim is added only when present.
	if s := strings.TrimSpace(furs.SKD); s != "" {
		errE := doc.Add(&document.StringClaim{
			CoreClaim: document.CoreClaim{
				ID:         document.GetID(NameSpaceProducts, "FURS", furs.RegistrationNumber, "SKD_2025", 0),
				Confidence: document.HighConfidence,
			},
			Prop:   document.GetCorePropertyReference("SKD_2025"),
			String: s,
		})
		if errE != nil {
			return doc, errE
		}
	}
	return doc, nil
}

// Run downloads the FURS DEJ archive, converts every record into a document
// and stores it.
//
// It returns nil without doing anything when the import is disabled. The
// number of retrieved records is added to indexingSize, and indexingCount is
// incremented once per converted record. A goroutine forwards each ticker
// update to the progress reporter. Processing stops at the first error or as
// soon as ctx is canceled; errors from converting or saving a record carry
// the record's registration number in their details under "id".
func (d FURSDEJ) Run(
	ctx context.Context,
	config *Config,
	httpClient *retryablehttp.Client,
	store *store.Store[json.RawMessage, *types.DocumentMetadata, *types.NoMetadata, *types.NoMetadata, *types.NoMetadata, document.Changes],
	indexingCount, indexingSize *x.Counter,
) errors.E {
	if d.Disabled {
		return nil
	}

	records, errE := downloadFurs(ctx, httpClient, config.Logger, config.CacheDir, fursDEJURL)
	if errE != nil {
		return errE
	}

	total := len(records)
	config.Logger.Info().Int("count", total).Msg("retrieved FURS DEJ data")

	label := "FURS DEJ processing"
	report := es.Progress(config.Logger, nil, nil, nil, label)
	indexingSize.Add(int64(total))

	processed := x.Counter(0)
	ticker := x.NewTicker(ctx, &processed, x.NewCounter(int64(total)), indexer.ProgressPrintRate)
	defer ticker.Stop()
	go func() {
		for update := range ticker.C {
			report(ctx, update)
		}
	}()

	for idx, entry := range records {
		// Stop as soon as the context has been canceled.
		if ctxErr := ctx.Err(); ctxErr != nil {
			return errors.WithStack(ctxErr)
		}

		regNo := entry.RegistrationNumber
		config.Logger.Debug().
			Int("index", idx).
			Str("id", regNo).
			Msg("processing company record")

		doc, docErrE := makeFursDoc(entry)
		if docErrE != nil {
			errors.Details(docErrE)["id"] = regNo
			return docErrE
		}

		// Counters are advanced once the record has been converted, before it is saved.
		processed.Increment()
		indexingCount.Increment()

		config.Logger.Debug().Str("doc", doc.ID.String()).Msg("saving document")
		if saveErrE := peerdb.InsertOrReplaceDocument(ctx, store, &doc); saveErrE != nil {
			errors.Details(saveErrE)["id"] = regNo
			return saveErrE
		}
	}

	config.Logger.Info().
		Int64("count", processed.Count()).
		Int("total", total).
		Msg(label + " done")

	return nil
}

// downloadFurs fetches the ZIP archive at url through the download cache and
// returns the records parsed from the DURS_zavezanci_DEJ.txt entry inside it.
//
// The archive is read as a stream, entry by entry, and reading stops at the
// first entry with the expected name. An error is returned when the download
// fails, when the archive cannot be read, when the entry cannot be parsed, or
// when the archive ends without containing the expected entry.
func downloadFurs(ctx context.Context, httpClient *retryablehttp.Client, logger zerolog.Logger, cacheDir, url string) ([]FursEntry, errors.E) {
	reader, _, errE := indexer.CachedDownload(ctx, httpClient, logger, cacheDir, url)
	if errE != nil {
		return nil, errE
	}
	defer reader.Close()

	zipReader := zipstream.NewReader(reader)
	var file *zip.FileHeader
	var err error
	for file, err = zipReader.Next(); err == nil; file, err = zipReader.Next() {
		if file.Name == "DURS_zavezanci_DEJ.txt" {
			records, errE := processFursDejFile(zipReader)
			if errE != nil {
				return nil, errE
			}
			return records, nil
		}
	}

	// io.EOF means that every entry was visited without finding the file.
	if errors.Is(err, io.EOF) {
		// Only the file name is quoted; previously the whole message was wrapped in
		// a pair of literal double quotes.
		return nil, errors.New(`"DURS_zavezanci_DEJ.txt" not found in ZIP`)
	}

	return nil, errors.WithStack(err)
}

// trimAndExtract returns line[start:end] with leading and trailing whitespace
// removed.
//
// start and end are byte offsets into line. An empty string is returned
// whenever the requested range does not fit inside line: when line is shorter
// than end, when start is negative, or when start lies past end. This keeps
// malformed input from causing a slice-bounds panic.
func trimAndExtract(line string, start, end int) string {
	if start < 0 || start > end || end > len(line) {
		return ""
	}
	return strings.TrimSpace(line[start:end])
}

// processFursDejFile parses the fixed-width FURS DEJ text read from reader and
// returns one FursEntry per non-blank line.
//
// Every line consists of five columns at fixed byte offsets, each followed by
// a single separator byte, and ends with a two-byte financial office code.
// Lines that are empty or contain only whitespace (for example a trailing
// blank line) are skipped. A line that is too short to contain a given column
// yields an empty value for that column instead of failing.
//
// NOTE(review): offsets are byte offsets, so multi-byte characters in a line
// would shift the columns that follow — confirm the encoding of the file.
//
// An error is returned only when reading from reader fails.
func processFursDejFile(reader io.Reader) ([]FursEntry, errors.E) {
	// Byte offsets at which each column ends (exclusive). The next column starts
	// one byte later, after the separator.
	const (
		vatEnd      = 8
		regEnd      = 19
		skdEnd      = 26
		nameEnd     = 127
		addressEnd  = 241
		officeWidth = 2
	)

	scanner := bufio.NewScanner(reader)
	var records []FursEntry

	for scanner.Scan() {
		line := scanner.Text()

		// A blank line carries no record; without this check the financial office
		// extraction below would slice out of range on an empty line.
		if strings.TrimSpace(line) == "" {
			continue
		}

		// The financial office code is taken from the end of the line rather than
		// from a fixed offset.
		financialOffice := ""
		if len(line) >= officeWidth {
			financialOffice = line[len(line)-officeWidth:]
		}

		records = append(records, FursEntry{
			VATNumber:          trimAndExtract(line, 0, vatEnd),
			RegistrationNumber: trimAndExtract(line, vatEnd+1, regEnd),
			SKD:                trimAndExtract(line, regEnd+1, skdEnd),
			Name:               trimAndExtract(line, skdEnd+1, nameEnd),
			Address:            trimAndExtract(line, nameEnd+1, addressEnd),
			FinancialOffice:    financialOffice,
		})
	}

	if err := scanner.Err(); err != nil {
		return nil, errors.WithStack(err)
	}

	return records, nil
}
48 changes: 48 additions & 0 deletions cmd/products/furs_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package main

import (
"bufio"
"embed"
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

// content embeds the testdata directory so that tests can open fixture files
// from it.
//
//go:embed testdata
var content embed.FS

// TestProcessFursDejFile parses the embedded DURS_zavezanci_DEJ.txt fixture
// and checks the number of records and the shape of every parsed field.
func TestProcessFursDejFile(t *testing.T) {
	t.Parallel()

	fixture, err := content.Open("testdata/DURS_zavezanci_DEJ.txt")
	require.NoError(t, err)
	t.Cleanup(func() { fixture.Close() })

	records, errE := processFursDejFile(bufio.NewReader(fixture))
	require.NoError(t, errE, "% -+#.1v", errE)
	require.Len(t, records, 8, "Expected 8 records, but got %d", len(records))

	assert.NotEmpty(t, records)

	// The record at index 5 is the problematic one: its SKD is an empty string.
	assert.Equal(t, "", records[5].SKD, "SKD mismatch")

	for i, entry := range records {
		assert.Len(t, entry.VATNumber, 8, "VATNumber should be 8 characters long")
		assert.Len(t, entry.RegistrationNumber, 10, "RegistrationNumber should be 10 characters long")
		assert.Len(t, entry.FinancialOffice, 2, "FinancialOffice should be 2 characters long")
		assert.NotEmpty(t, entry.Name, "Name should not be empty")
		assert.NotEmpty(t, entry.Address, "Address should not be empty")

		// Records 5 and 6 are exempt from the SKD length check.
		if i != 5 && i != 6 {
			assert.Len(t, entry.SKD, 6, "SKD should be 'XX.XXX' 6 characters long")
		}
	}
}
4 changes: 4 additions & 0 deletions cmd/products/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ func index(config *Config) errors.E {
return config.Zito.Run(ctx, config, httpClient, store, indexingCount, indexingSize)
})

g.Go(func() error {
return config.FURSDEJ.Run(ctx, config, httpClient, store, indexingCount, indexingSize)
})

errE = errors.WithStack(g.Wait())
if errE != nil {
return errE
Expand Down
44 changes: 44 additions & 0 deletions cmd/products/properties.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,50 @@ var productsProperties = []struct {
`A description of packaging of a branded food product.`,
[]string{`"text" claim type`},
},

// FursEntry specific properties start here.
{
"VAT number",
nil,
`A company VAT number.`,
[]string{`"identifier" claim type`},
},
{
"Company registration number",
nil,
`A company registration number.`,
[]string{`"identifier" claim type`},
},
{
"SKD 2025",
[]string{"Standard Classification of Activities 2025"},
`National Standard Classification of Activities in Slovenia extending NACE Rev. 2.1..`,
[]string{`"string" claim type`},
},
{
"company",
nil,
"A document is about a company.",
[]string{`item`},
},
{
"address",
nil,
`An address.`,
[]string{`"text" claim type`},
},
{
"financial office",
nil,
`A financial office responsible for the company.`,
[]string{`"string" claim type`},
},
{
"country of incorporation",
nil,
`Country of incorporation.`,
[]string{`"string" claim type`},
},
}

func init() { //nolint:gochecknoinits
Expand Down
3 changes: 3 additions & 0 deletions cmd/products/testdata/DURS_zavezanci_DEJ.txt
Git LFS file not shown

0 comments on commit 4f98c31

Please sign in to comment.