Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

revert stream / field filtering in sources #1095

Merged
merged 4 commits into from
Nov 26, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -28,20 +28,15 @@
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
import io.airbyte.commons.json.Jsons;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;

public class CatalogHelpers {

Expand Down Expand Up @@ -143,80 +138,4 @@ protected static Set<String> getAllFieldNames(JsonNode node) {
return allFieldNames;
}

/**
 * Checks whether an identifier satisfies the alphanumeric+underscore requirement for identifiers.
 *
 * <p>
 * The characters {@code -}, {@code _}, {@code .} and {@code $} are ignored. Every other character
 * must be a letter or digit according to {@link Character#isLetterOrDigit(char)}, and at least one
 * such character must be present. The check is done in a single pass with no regex and no
 * intermediate string, because callers may invoke it for every record.
 *
 * @param identifier stream name or field name; may be {@code null}
 * @return {@code true} if the identifier is valid; {@code false} otherwise, including for
 *         {@code null}, the empty string, and identifiers made up only of ignored characters
 */
public static boolean isValidIdentifier(String identifier) {
  if (identifier == null) {
    return false;
  }

  boolean sawLetterOrDigit = false;
  for (int i = 0; i < identifier.length(); i++) {
    final char c = identifier.charAt(i);
    // todo (cgardens) - remove $ once mailchimp is fixed.
    if (c == '-' || c == '_' || c == '.' || c == '$') {
      continue;
    }
    if (!Character.isLetterOrDigit(c)) {
      return false;
    }
    sawLetterOrDigit = true;
  }
  // An identifier with nothing left after ignoring separators is not valid.
  return sawLetterOrDigit;
}

/**
 * Finds the invalid stream names in a catalog.
 *
 * @param catalog airbyte catalog
 * @return list of stream names in the catalog that are invalid
 */
public static List<String> getInvalidStreamNames(AirbyteCatalog catalog) {
  final Stream<String> streamNames = catalog.getStreams()
      .stream()
      .map(stream -> stream.getName());
  return getInvalidStreamNames(streamNames);
}

/**
 * Finds the invalid stream names in a configured catalog.
 *
 * @param catalog configured airbyte catalog
 * @return list of stream names in the catalog that are invalid
 */
public static List<String> getInvalidStreamNames(ConfiguredAirbyteCatalog catalog) {
  final Stream<String> streamNames = catalog.getStreams()
      .stream()
      .map(configuredStream -> configuredStream.getStream().getName());
  return getInvalidStreamNames(streamNames);
}

// Keeps, in encounter order, only the names that fail isValidIdentifier.
private static List<String> getInvalidStreamNames(Stream<String> names) {
  final Stream<String> invalidNames = names.filter(candidate -> {
    final boolean valid = isValidIdentifier(candidate);
    return !valid;
  });
  return invalidNames.collect(Collectors.toList());
}

/**
 * Finds the invalid field names of every stream in a catalog.
 *
 * @param catalog airbyte catalog
 * @return multimap of stream names to all invalid field names in that stream
 */
public static Multimap<String, String> getInvalidFieldNames(AirbyteCatalog catalog) {
  final Map<String, JsonNode> schemaByStreamName = getStreamNameToJsonSchema(catalog);
  return getInvalidFieldNames(schemaByStreamName);
}

/**
 * Finds the invalid field names of every stream in a configured catalog.
 *
 * @param catalog configured airbyte catalog
 * @return multimap of stream names to all invalid field names in that stream
 */
public static Multimap<String, String> getInvalidFieldNames(ConfiguredAirbyteCatalog catalog) {
  final Map<String, JsonNode> schemaByStreamName = getStreamNameToJsonSchema(catalog);
  return getInvalidFieldNames(schemaByStreamName);
}

// Indexes the JSON schema of each stream in the catalog by the stream's name.
private static Map<String, JsonNode> getStreamNameToJsonSchema(AirbyteCatalog catalog) {
  final List<AirbyteStream> streams = catalog.getStreams();
  return streams.stream()
      .collect(Collectors.toMap(
          stream -> stream.getName(),
          stream -> stream.getJsonSchema()));
}

// Indexes the JSON schema of each configured stream's underlying stream by that stream's name.
private static Map<String, JsonNode> getStreamNameToJsonSchema(ConfiguredAirbyteCatalog catalog) {
  final Stream<AirbyteStream> streams = catalog.getStreams()
      .stream()
      .map(configuredStream -> configuredStream.getStream());
  return streams.collect(Collectors.toMap(
      stream -> stream.getName(),
      stream -> stream.getJsonSchema()));
}

// For each stream, records every field name from its schema that fails isValidIdentifier.
// Streams without invalid field names get no entry in the returned multimap.
private static Multimap<String, String> getInvalidFieldNames(Map<String, JsonNode> streamNameToJsonSchema) {
  final Multimap<String, String> invalidFieldsByStream = Multimaps.newSetMultimap(new HashMap<>(), HashSet::new);

  streamNameToJsonSchema.forEach((streamName, jsonSchema) -> {
    for (final String fieldName : getAllFieldNames(jsonSchema)) {
      if (!isValidIdentifier(fieldName)) {
        invalidFieldsByStream.put(streamName, fieldName);
      }
    }
  });

  return invalidFieldsByStream;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,14 @@
package io.airbyte.protocol.models;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertIterableEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

import com.fasterxml.jackson.databind.JsonNode;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import io.airbyte.commons.json.Jsons;
import io.airbyte.commons.resources.MoreResources;
import io.airbyte.protocol.models.Field.JsonSchemaPrimitive;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import org.junit.jupiter.api.Test;

Expand All @@ -62,46 +55,6 @@ void testGetTopLevelFieldNames() {
assertEquals(Sets.newHashSet("name"), actualFieldNames);
}

// Identifiers made of letters, digits and the separators - _ . must be accepted.
@Test
void testValidIdentifiers() {
  final List<String> accepted = List.of(
      "identifier_name",
      "iDenTiFieR_name",
      "__identifier_name",
      "IDENTIFIER_NAME",
      "123identifier_name",
      "i0d0e0n0t0i0f0i0e0r0n0a0m0e",
      "identifiêr",
      "a_unicode_name_文",
      "identifier__name__",
      "identifier-name.weee");

  for (final String identifier : accepted) {
    assertTrue(CatalogHelpers.isValidIdentifier(identifier));
  }
}

// Identifiers containing spaces, quotes, backticks or % must be rejected.
@Test
void testInvalidIdentifiers() {
  final List<String> rejected = List.of(
      "\"identifier name",
      "identifier name",
      "identifier%",
      "`identifier`",
      "'identifier'");

  for (final String identifier : rejected) {
    assertFalse(CatalogHelpers.isValidIdentifier(identifier));
  }
}

// A catalog with one valid and one invalid stream name must report only the invalid one.
@Test
void testGetInvalidStreamNames() {
  final String goodName = "Valid_Stream";
  final String badName = "invalid stream";

  final AirbyteStream goodStream = new AirbyteStream();
  goodStream.setName(goodName);

  final AirbyteStream badStream = new AirbyteStream();
  badStream.setName(badName);

  final AirbyteCatalog catalog = new AirbyteCatalog();
  catalog.setStreams(List.of(goodStream, badStream));

  final List<String> actual = CatalogHelpers.getInvalidStreamNames(catalog);
  assertIterableEquals(Collections.singleton(badName), actual);
}

@Test
void testGetFieldNames() throws IOException {
JsonNode node = Jsons.deserialize(MoreResources.readResource("valid_schema.json"));
Expand All @@ -111,26 +64,4 @@ void testGetFieldNames() throws IOException {
assertEquals(expectedFieldNames, actualFieldNames);
}

// A catalog with one clean stream and one stream with invalid field names must report only
// the invalid stream, mapped to exactly its invalid field names.
@Test
void testGetInvalidFieldNames() throws IOException {
  final String validStreamName = "Valid_Stream";
  final AirbyteStream validStream = new AirbyteStream();
  validStream.setName(validStreamName);
  JsonNode validSchema = Jsons.deserialize(MoreResources.readResource("valid_schema.json"));
  validStream.setJsonSchema(validSchema);

  final String invalidStreamName = "invalid stream";
  AirbyteStream invalidStream = new AirbyteStream();
  invalidStream.setName(invalidStreamName);
  JsonNode invalidSchema = Jsons.deserialize(MoreResources.readResource("invalid_schema.json"));
  invalidStream.setJsonSchema(invalidSchema);

  AirbyteCatalog catalog = new AirbyteCatalog();
  catalog.setStreams(List.of(validStream, invalidStream));

  Multimap<String, String> streamNameToInvalidFieldNames = CatalogHelpers.getInvalidFieldNames(catalog);
  assertIterableEquals(Collections.singleton(invalidStreamName), streamNameToInvalidFieldNames.keySet());

  // Compare as sets: the field names come back from a hash-based collection, so their iteration
  // order is an implementation detail the test must not depend on.
  assertEquals(
      Sets.newHashSet("\"CZK", "C A D"),
      Sets.newHashSet(streamNameToInvalidFieldNames.get(invalidStreamName)));
}

}
13 changes: 0 additions & 13 deletions airbyte-protocol/models/src/test/resources/invalid_schema.json

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
package io.airbyte.workers;

import com.fasterxml.jackson.databind.JsonNode;
import com.google.common.collect.Multimap;
import io.airbyte.commons.io.IOs;
import io.airbyte.commons.io.LineGobbler;
import io.airbyte.commons.json.Jsons;
Expand All @@ -34,13 +33,11 @@
import io.airbyte.protocol.models.AirbyteCatalog;
import io.airbyte.protocol.models.AirbyteMessage;
import io.airbyte.protocol.models.AirbyteMessage.Type;
import io.airbyte.protocol.models.CatalogHelpers;
import io.airbyte.workers.process.IntegrationLauncher;
import io.airbyte.workers.protocols.airbyte.AirbyteStreamFactory;
import io.airbyte.workers.protocols.airbyte.DefaultAirbyteStreamFactory;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import org.slf4j.Logger;
Expand Down Expand Up @@ -94,20 +91,6 @@ public OutputAndStatus<StandardDiscoverCatalogOutput> run(final StandardDiscover
return new OutputAndStatus<>(JobStatus.FAILED);
}

List<String> invalidStreamNames = CatalogHelpers.getInvalidStreamNames(catalog.get());

if (!invalidStreamNames.isEmpty()) {
invalidStreamNames.forEach(streamName -> LOGGER.error("Cannot sync invalid stream name: " + streamName));
return new OutputAndStatus<>(JobStatus.FAILED);
}

Multimap<String, String> streamNameToInvalidFieldNames = CatalogHelpers.getInvalidFieldNames(catalog.get());
if (!streamNameToInvalidFieldNames.isEmpty()) {
streamNameToInvalidFieldNames
.forEach((streamName, fieldNames) -> LOGGER.error("Cannot sync invalid field names for stream " + streamName + ": " + fieldNames));
return new OutputAndStatus<>(JobStatus.FAILED);
}

return new OutputAndStatus<>(
JobStatus.SUCCEEDED,
new StandardDiscoverCatalogOutput()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,28 +24,21 @@

package io.airbyte.workers;

import com.google.common.collect.Sets;
import io.airbyte.config.StandardSyncInput;
import io.airbyte.config.StandardSyncOutput;
import io.airbyte.config.StandardSyncSummary;
import io.airbyte.config.StandardTapConfig;
import io.airbyte.config.StandardTargetConfig;
import io.airbyte.config.State;
import io.airbyte.protocol.models.AirbyteMessage;
import io.airbyte.protocol.models.CatalogHelpers;
import io.airbyte.protocol.models.ConfiguredAirbyteCatalog;
import io.airbyte.protocol.models.ConfiguredAirbyteStream;
import io.airbyte.protocol.models.SyncMode;
import io.airbyte.workers.normalization.NormalizationRunner;
import io.airbyte.workers.protocols.Destination;
import io.airbyte.workers.protocols.MessageTracker;
import io.airbyte.workers.protocols.Source;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.stream.Collectors;
import org.slf4j.Logger;
Expand Down Expand Up @@ -82,9 +75,6 @@ public OutputAndStatus<StandardSyncOutput> run(StandardSyncInput syncInput, Path
.stream()
.collect(Collectors.toMap(s -> s.getStream().getName(), s -> s.getSyncMode() != null ? s.getSyncMode() : SyncMode.FULL_REFRESH)));

// clean catalog object
removeInvalidStreams(syncInput.getCatalog());

final StandardTapConfig tapConfig = WorkerUtils.syncToTapConfig(syncInput);
final StandardTargetConfig targetConfig = WorkerUtils.syncToTargetConfig(syncInput);

Expand All @@ -97,12 +87,8 @@ public OutputAndStatus<StandardSyncOutput> run(StandardSyncInput syncInput, Path
if (maybeMessage.isPresent()) {
final AirbyteMessage message = maybeMessage.get();

if (message.getType().equals(AirbyteMessage.Type.RECORD) && !CatalogHelpers.isValidIdentifier(message.getRecord().getStream())) {
LOGGER.error("Filtered out record for invalid stream: " + message.getRecord().getStream());
} else {
messageTracker.accept(message);
destination.accept(message);
}
messageTracker.accept(message);
destination.accept(message);
}
}

Expand Down Expand Up @@ -145,16 +131,4 @@ public void cancel() {
cancelled.set(true);
}

// Drops, in place, every configured stream whose name is invalid or that has at least one
// invalid field name; the catalog's stream list is replaced with the filtered list.
private void removeInvalidStreams(ConfiguredAirbyteCatalog catalog) {
  final Set<String> namesToDrop = new HashSet<>(CatalogHelpers.getInvalidStreamNames(catalog));
  namesToDrop.addAll(CatalogHelpers.getInvalidFieldNames(catalog).keySet());

  catalog.setStreams(
      catalog.getStreams().stream()
          .filter(configuredStream -> !namesToDrop.contains(configuredStream.getStream().getName()))
          .collect(Collectors.toList()));
}

}
Loading