Skip to content

Commit

Permalink
refactor(proto): add doc-level uri out of oneof content
Browse files Browse the repository at this point in the history
  • Loading branch information
hanxiao committed Jun 1, 2020
1 parent 35aa9e5 commit 66dfe75
Show file tree
Hide file tree
Showing 41 changed files with 171 additions and 210 deletions.
2 changes: 1 addition & 1 deletion jina/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

# do not change this line manually
# this is managed by proto/build-proto.sh and updated on every execution
__proto_version__ = '0.0.29'
__proto_version__ = '0.0.30'

import platform
import sys
Expand Down
36 changes: 29 additions & 7 deletions jina/clients/python/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,18 @@
__license__ = "Apache-2.0"

import ctypes
import mimetypes
import os
import random
import urllib.parse
from typing import Iterator, Union

import numpy as np

from ...drivers.helper import array2pb
from ...drivers.helper import array2pb, guess_mime
from ...enums import ClientMode
from ...helper import batch_iterator
from ...logging import default_logger
from ...proto import jina_pb2


Expand All @@ -20,6 +22,19 @@ def _generate(data: Union[Iterator[bytes], Iterator['jina_pb2.Document'], Iterat
random_doc_id: bool = False, mode: ClientMode = ClientMode.INDEX, top_k: int = 50,
mime_type: str = None,
*args, **kwargs) -> Iterator['jina_pb2.Message']:
buffer_sniff = False

try:
import magic
buffer_sniff = True
except (ImportError, ModuleNotFoundError):
default_logger.warning(f'can not sniff the MIME type '
f'MIME sniffing requires pip install "jina[http]" '
f'and brew install libmagic (Mac)/ apt-get install libmagic1 (Linux)')

if mime_type and (mime_type not in mimetypes.types_map.values()):
mime_type = mimetypes.guess_type(f'*.{mime_type}')[0]

if isinstance(mode, str):
mode = ClientMode.from_string(mode)

Expand All @@ -41,19 +56,26 @@ def _generate(data: Union[Iterator[bytes], Iterator['jina_pb2.Document'], Iterat
d.blob.CopyFrom(array2pb(_raw))
elif isinstance(_raw, bytes):
d.buffer = _raw
if mime_type:
d.mime_type = mime_type
if not mime_type and buffer_sniff:
try:
import magic
mime_type = magic.from_buffer(_raw, mime=True)
except Exception as ex:
default_logger.warning(f'can not sniff the MIME type due to the exception {ex}')
elif isinstance(_raw, str):
scheme = urllib.parse.urlparse(_raw).scheme
if scheme in {'http', 'https'} or os.path.exists(_raw) or os.access(os.path.dirname(_raw), os.W_OK):
d.file_path = _raw
elif scheme == 'data':
d.data_uri = _raw
if (scheme in {'http', 'https', 'data'} or os.path.exists(_raw)
or os.access(os.path.dirname(_raw), os.W_OK)):
d.uri = _raw
mime_type = guess_mime(_raw)
else:
d.text = _raw
else:
raise TypeError(f'{type(_raw)} type of input is not supported')

if mime_type:
d.mime_type = mime_type

d.doc_id = first_doc_id if not random_doc_id else random.randint(0, ctypes.c_uint(-1).value)
d.weight = 1.0
first_doc_id += 1
Expand Down
42 changes: 20 additions & 22 deletions jina/drivers/craft.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,9 @@

import ctypes
import random
import urllib.parse
import urllib.request

from . import BaseExecutableDriver, BaseDriver
from .helper import array2pb, pb_obj2dict, pb2array
from .helper import array2pb, pb_obj2dict, pb2array, guess_mime
from ..proto import jina_pb2


Expand Down Expand Up @@ -100,36 +98,36 @@ def __init__(self, default_mime: str = 'application/octet-stream', *args, **kwar
"""
super().__init__(*args, **kwargs)
self.default_mime = default_mime
self.buffer_sniff = False
try:
import magic
self.buffer_sniff = True
except (ImportError, ModuleNotFoundError):
self.logger.warning(f'can not sniff the MIME type '
f'MIME sniffing requires pip install "jina[http]" '
f'and brew install libmagic (Mac)/ apt-get install libmagic1 (Linux)')

def __call__(self, *args, **kwargs):
import mimetypes

for d in self.req.docs:
# mime_type may be a file extension
m_type = d.mime_type
if m_type and m_type not in mimetypes.types_map.values():
if m_type and (m_type not in mimetypes.types_map.values()):
m_type = mimetypes.guess_type(f'*.{m_type}')[0]

d_type = d.WhichOneof('content')
if not m_type and d_type: # for ClientInputType=PROTO, d_type could be empty
if not m_type: # for ClientInputType=PROTO, d_type could be empty
d_type = d.WhichOneof('content')
if d_type == 'buffer':
d_content = getattr(d, d_type)
# d.mime_type = 'application/octet-stream' # default by IANA standard
try:
import magic
m_type = magic.from_buffer(d_content, mime=True)
except (ImportError, ModuleNotFoundError):
self.logger.warning(f'can not sniff the MIME type '
f'MIME sniffing requires pip install "jina[http]" '
f'and brew install libmagic (Mac)/ apt-get install libmagic1 (Linux)')
except Exception as ex:
self.logger.warning(f'can not sniff the MIME type due to the exception {ex}')
elif d_type in {'file_path', 'data_uri'}:
d_content = getattr(d, d_type)
m_type = mimetypes.guess_type(d_content)[0]
if not m_type and urllib.parse.urlparse(d_content).scheme in {'http', 'https', 'data'}:
tmp = urllib.request.urlopen(d_content)
m_type = tmp.info().get_content_type()
if self.buffer_sniff:
try:
import magic
m_type = magic.from_buffer(d_content, mime=True)
except Exception as ex:
self.logger.warning(f'can not sniff the MIME type due to the exception {ex}')
if d.uri:
m_type = guess_mime(d.uri)

if m_type:
d.mime_type = m_type
Expand Down
13 changes: 13 additions & 0 deletions jina/drivers/helper.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
__copyright__ = "Copyright (c) 2020 Jina AI Limited. All rights reserved."
__license__ = "Apache-2.0"

import mimetypes
import os
import urllib.parse
import urllib.request
from typing import Dict, Any, Iterable, Tuple

import numpy as np
Expand Down Expand Up @@ -141,3 +144,13 @@ def pb_obj2dict(obj, keys: Iterable[str]) -> Dict[str, Any]:
:param keys: an iterable of keys for extraction
"""
return {k: getattr(obj, k) for k in keys if hasattr(obj, k)}


def guess_mime(uri):
    """Guess the MIME type of the document that ``uri`` refers to.

    The type is first guessed from the URI string alone with
    :func:`mimetypes.guess_type`. Only when that gives no answer and the
    scheme is ``http``, ``https`` or ``data`` is the URI actually opened, in
    which case the content type reported by the response is used.

    :param uri: a local file path, a remote URL or a data URI
    :return: the guessed MIME type, or ``None`` when the URI string gives no
        hint and the URI is not one of the schemes that get opened
    :raises: any exception raised by :func:`urllib.request.urlopen` is
        propagated to the caller unchanged
    """
    # cheap guess first: no I/O, works for local paths and most URLs
    m_type = mimetypes.guess_type(uri)[0]
    # fall back to opening the resource and reading its declared content type
    if not m_type and urllib.parse.urlparse(uri).scheme in {'http', 'https', 'data'}:
        # the context manager closes the response so the connection does not leak
        with urllib.request.urlopen(uri) as tmp:
            m_type = tmp.info().get_content_type()
    return m_type
7 changes: 0 additions & 7 deletions jina/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,10 +221,3 @@ class ClientMode(BetterEnum):
SEARCH = 1
TRAIN = 2


class ClientInputType(BetterEnum):
""" The input mode of the client"""
BUFFER = 0
DATA_URI = 1
PROTOBUF = 2
FILE_PATH = 3
56 changes: 18 additions & 38 deletions jina/executors/crafters/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,46 +13,26 @@
from . import BaseDocCrafter


class FilePath2Buffer(BaseDocCrafter):
class URI2Buffer(BaseDocCrafter):
""" Convert local file path, remote URL doc to a buffer doc.
"""

def craft(self, file_path: str, *args, **kwargs):
if urllib.parse.urlparse(file_path).scheme in {'http', 'https', 'data'}:
page = urllib.request.Request(file_path, headers={'User-Agent': 'Mozilla/5.0'})
def craft(self, uri: str, *args, **kwargs):
if urllib.parse.urlparse(uri).scheme in {'http', 'https', 'data'}:
page = urllib.request.Request(uri, headers={'User-Agent': 'Mozilla/5.0'})
tmp = urllib.request.urlopen(page)
buffer = tmp.read()
elif os.path.exists(file_path):
with open(file_path, 'rb') as fp:
elif os.path.exists(uri):
with open(uri, 'rb') as fp:
buffer = fp.read()
else:
raise FileNotFoundError(f'{file_path} is not a URL or a valid local path')
raise FileNotFoundError(f'{uri} is not a URL or a valid local path')
return dict(buffer=buffer)


class DataURI2Buffer(FilePath2Buffer):
""" Convert a data URI doc to a buffer doc.
"""

def craft(self, data_uri: str, *args, **kwargs):
return super().craft(data_uri)


class PathURI2Buffer(DataURI2Buffer):
def craft(self, file_path: str, data_uri: str, buffer: bytes, *args, **kwargs):
if buffer:
pass
elif file_path:
return FilePath2Buffer.craft(self, file_path)
elif data_uri:
return DataURI2Buffer.craft(self, data_uri)
else:
raise ValueError('this document has no "file_path", no "data_uri" and no "buffer" set')


class FilePath2DataURI(FilePath2Buffer):
class Path2DataURI(URI2Buffer):
def __init__(self, charset: str = 'utf-8', base64: bool = False, *args, **kwargs):
""" Convert file path doc to data uri doc.
""" Convert file path doc to data uri doc. Internally it first reads into buffer and then converts it to data URI.
:param charset: charset may be any character set registered with IANA
:param base64: used to encode arbitrary octet sequences into a form that satisfies the rules of 7bit. Designed to be efficient for non-text 8 bit and binary data. Sometimes used for text data that frequently uses non-US-ASCII characters.
Expand All @@ -63,9 +43,9 @@ def __init__(self, charset: str = 'utf-8', base64: bool = False, *args, **kwargs
self.charset = charset
self.base64 = base64

def craft(self, file_path: str, mime_type: str, *args, **kwargs):
d = super().craft(file_path)
return dict(data_uri=self.make_datauri(mime_type, d['buffer']))
def craft(self, uri: str, mime_type: str, *args, **kwargs):
d = super().craft(uri)
return dict(uri=self.make_datauri(mime_type, d['buffer']))

def make_datauri(self, mimetype, buffer):
parts = ['data:', mimetype]
Expand All @@ -82,23 +62,23 @@ def make_datauri(self, mimetype, buffer):
return ''.join(parts)


class Buffer2DataURI(FilePath2DataURI):
class Buffer2DataURI(Path2DataURI):
"""Convert buffer to data URI"""

def craft(self, buffer: bytes, mime_type: str, *args, **kwargs):
return dict(data_uri=self.make_datauri(mime_type, buffer))
return dict(uri=self.make_datauri(mime_type, buffer))


class Buffer2NdArray(BaseDocCrafter):
"""Convert buffer to numpy array"""

def craft(self, buffer, *args, **kwargs):
def craft(self, buffer: bytes, *args, **kwargs):
return dict(blob=np.frombuffer(buffer))


class Blob2PNGDataURI(FilePath2DataURI):
class Blob2PNGDataURI(BaseDocCrafter):
"""Simple DocCrafter used in :command:`jina hello-world`,
it reads ``buffer`` into base64 png and stored in ``data_uri``"""
it reads ``buffer`` into base64 png and stored in ``uri``"""

def __init__(self, width: int = 28, height: int = 28, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand Down Expand Up @@ -128,4 +108,4 @@ def png_pack(png_tag, data):
png_pack(b'IHDR', struct.pack('!2I5B', self.width, self.height, 8, 6, 0, 0, 0)),
png_pack(b'IDAT', zlib.compress(raw_data, 9)),
png_pack(b'IEND', b'')])
return dict(data_uri='data:image/png;base64,' + base64.b64encode(png_bytes).decode())
return dict(uri='data:image/png;base64,' + base64.b64encode(png_bytes).decode())
4 changes: 2 additions & 2 deletions jina/helloworld/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@

def print_result(resp):
for d in resp.search.docs:
vi = d.data_uri
vi = d.uri
result_html.append(f'<tr><td><img src="{vi}"/></td><td>')
for kk in d.topk_results:
kmi = kk.match_doc.data_uri
kmi = kk.match_doc.uri
result_html.append(f'<img src="{kmi}" style="opacity:{kk.score.value}"/>')
# k['score']['explained'] = json.loads(kk.score.explained)
result_html.append('</td></tr>\n')
Expand Down
9 changes: 3 additions & 6 deletions jina/proto/jina.proto
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,6 @@ message Document {
// the raw binary content of this document, which often represents the original document when comes into jina
bytes buffer = 3;

// a data uri document
string data_uri = 9;

// a local file path, or a remote url starts with http or https points to a document
string file_path = 11;

// the ndarray of the image/audio/video document
NdArray blob = 12;

Expand All @@ -134,6 +128,9 @@ message Document {

// mime type of this document, for buffer content, this is required; for other contents, this can be guessed
string mime_type = 10;

// a uri of the document could be: a local file path, a remote url starts with http or https or data URI scheme
string uri = 9;
}

/**
Expand Down
Loading

0 comments on commit 66dfe75

Please sign in to comment.