refactor(proto): add doc-level uri out of oneof content

jina-ai · Jun 1, 2020 · 66dfe75 · 66dfe75
1 parent 35aa9e5
commit 66dfe75
Show file tree

Hide file tree

Showing 41 changed files with 171 additions and 210 deletions.
diff --git a/jina/__init__.py b/jina/__init__.py
@@ -7,7 +7,7 @@
 
 # do not change this line manually
 # this is managed by proto/build-proto.sh and updated on every execution
-__proto_version__ = '0.0.29'
+__proto_version__ = '0.0.30'
 
 import platform
 import sys

diff --git a/jina/clients/python/request.py b/jina/clients/python/request.py
@@ -2,16 +2,18 @@
 __license__ = "Apache-2.0"
 
 import ctypes
+import mimetypes
 import os
 import random
 import urllib.parse
 from typing import Iterator, Union
 
 import numpy as np
 
-from ...drivers.helper import array2pb
+from ...drivers.helper import array2pb, guess_mime
 from ...enums import ClientMode
 from ...helper import batch_iterator
+from ...logging import default_logger
 from ...proto import jina_pb2
 
 
@@ -20,6 +22,19 @@ def _generate(data: Union[Iterator[bytes], Iterator['jina_pb2.Document'], Iterat
               random_doc_id: bool = False, mode: ClientMode = ClientMode.INDEX, top_k: int = 50,
               mime_type: str = None,
               *args, **kwargs) -> Iterator['jina_pb2.Message']:
+    buffer_sniff = False
+
+    try:
+        import magic
+        buffer_sniff = True
+    except (ImportError, ModuleNotFoundError):
+        default_logger.warning(f'can not sniff the MIME type '
+                               f'MIME sniffing requires pip install "jina[http]" '
+                               f'and brew install libmagic (Mac)/ apt-get install libmagic1 (Linux)')
+
+    if mime_type and (mime_type not in mimetypes.types_map.values()):
+        mime_type = mimetypes.guess_type(f'*.{mime_type}')[0]
+
     if isinstance(mode, str):
         mode = ClientMode.from_string(mode)
 
@@ -41,19 +56,26 @@ def _generate(data: Union[Iterator[bytes], Iterator['jina_pb2.Document'], Iterat
                 d.blob.CopyFrom(array2pb(_raw))
             elif isinstance(_raw, bytes):
                 d.buffer = _raw
-                if mime_type:
-                    d.mime_type = mime_type
+                if not mime_type and buffer_sniff:
+                    try:
+                        import magic
+                        mime_type = magic.from_buffer(_raw, mime=True)
+                    except Exception as ex:
+                        default_logger.warning(f'can not sniff the MIME type due to the exception {ex}')
             elif isinstance(_raw, str):
                 scheme = urllib.parse.urlparse(_raw).scheme
-                if scheme in {'http', 'https'} or os.path.exists(_raw) or os.access(os.path.dirname(_raw), os.W_OK):
-                    d.file_path = _raw
-                elif scheme == 'data':
-                    d.data_uri = _raw
+                if (scheme in {'http', 'https', 'data'} or os.path.exists(_raw)
+                        or os.access(os.path.dirname(_raw), os.W_OK)):
+                    d.uri = _raw
+                    mime_type = guess_mime(_raw)
                 else:
                     d.text = _raw
             else:
                 raise TypeError(f'{type(_raw)} type of input is not supported')
 
+            if mime_type:
+                d.mime_type = mime_type
+
             d.doc_id = first_doc_id if not random_doc_id else random.randint(0, ctypes.c_uint(-1).value)
             d.weight = 1.0
             first_doc_id += 1

diff --git a/jina/drivers/craft.py b/jina/drivers/craft.py
@@ -3,11 +3,9 @@
 
 import ctypes
 import random
-import urllib.parse
-import urllib.request
 
 from . import BaseExecutableDriver, BaseDriver
-from .helper import array2pb, pb_obj2dict, pb2array
+from .helper import array2pb, pb_obj2dict, pb2array, guess_mime
 from ..proto import jina_pb2
 
 
@@ -100,36 +98,36 @@ def __init__(self, default_mime: str = 'application/octet-stream', *args, **kwar
         """
         super().__init__(*args, **kwargs)
         self.default_mime = default_mime
+        self.buffer_sniff = False
+        try:
+            import magic
+            self.buffer_sniff = True
+        except (ImportError, ModuleNotFoundError):
+            self.logger.warning(f'can not sniff the MIME type '
+                                f'MIME sniffing requires pip install "jina[http]" '
+                                f'and brew install libmagic (Mac)/ apt-get install libmagic1 (Linux)')
 
     def __call__(self, *args, **kwargs):
         import mimetypes
 
         for d in self.req.docs:
             # mime_type may be a file extension
             m_type = d.mime_type
-            if m_type and m_type not in mimetypes.types_map.values():
+            if m_type and (m_type not in mimetypes.types_map.values()):
                 m_type = mimetypes.guess_type(f'*.{m_type}')[0]
 
-            d_type = d.WhichOneof('content')
-            if not m_type and d_type:  # for ClientInputType=PROTO, d_type could be empty
+            if not m_type:  # for ClientInputType=PROTO, d_type could be empty
+                d_type = d.WhichOneof('content')
                 if d_type == 'buffer':
                     d_content = getattr(d, d_type)
-                    # d.mime_type = 'application/octet-stream'  # default by IANA standard
-                    try:
-                        import magic
-                        m_type = magic.from_buffer(d_content, mime=True)
-                    except (ImportError, ModuleNotFoundError):
-                        self.logger.warning(f'can not sniff the MIME type '
-                                            f'MIME sniffing requires pip install "jina[http]" '
-                                            f'and brew install libmagic (Mac)/ apt-get install libmagic1 (Linux)')
-                    except Exception as ex:
-                        self.logger.warning(f'can not sniff the MIME type due to the exception {ex}')
-                elif d_type in {'file_path', 'data_uri'}:
-                    d_content = getattr(d, d_type)
-                    m_type = mimetypes.guess_type(d_content)[0]
-                    if not m_type and urllib.parse.urlparse(d_content).scheme in {'http', 'https', 'data'}:
-                        tmp = urllib.request.urlopen(d_content)
-                        m_type = tmp.info().get_content_type()
+                    if self.buffer_sniff:
+                        try:
+                            import magic
+                            m_type = magic.from_buffer(d_content, mime=True)
+                        except Exception as ex:
+                            self.logger.warning(f'can not sniff the MIME type due to the exception {ex}')
+                if d.uri:
+                    m_type = guess_mime(d.uri)
 
             if m_type:
                 d.mime_type = m_type

diff --git a/jina/drivers/helper.py b/jina/drivers/helper.py
@@ -1,7 +1,10 @@
 __copyright__ = "Copyright (c) 2020 Jina AI Limited. All rights reserved."
 __license__ = "Apache-2.0"
 
+import mimetypes
 import os
+import urllib.parse
+import urllib.request
 from typing import Dict, Any, Iterable, Tuple
 
 import numpy as np
@@ -141,3 +144,13 @@ def pb_obj2dict(obj, keys: Iterable[str]) -> Dict[str, Any]:
     :param keys: an iterable of keys for extraction
     """
     return {k: getattr(obj, k) for k in keys if hasattr(obj, k)}
+
+
+def guess_mime(uri):
+    """Guess the MIME type of a document from its URI.
+
+    The type is first derived from the URI itself via :func:`mimetypes.guess_type`.
+    Only when that yields nothing and the scheme is ``http``, ``https`` or ``data``
+    is the resource opened and the content type reported by the response used.
+
+    :param uri: a local file path, a remote URL or a data URI
+    :return: the guessed MIME type, or ``None`` when nothing could be guessed
+    """
+    # cheap guess first: no I/O, works for local paths and for most URLs
+    m_type = mimetypes.guess_type(uri)[0]
+    if not m_type and urllib.parse.urlparse(uri).scheme in {'http', 'https', 'data'}:
+        # the context manager closes the response once the header has been read,
+        # so no handle/socket is left open per lookup
+        with urllib.request.urlopen(uri) as tmp:
+            m_type = tmp.info().get_content_type()
+    return m_type
diff --git a/jina/enums.py b/jina/enums.py
@@ -221,10 +221,3 @@ class ClientMode(BetterEnum):
     SEARCH = 1
     TRAIN = 2
 
-
-class ClientInputType(BetterEnum):
-    """ The input mode of the client"""
-    BUFFER = 0
-    DATA_URI = 1
-    PROTOBUF = 2
-    FILE_PATH = 3
diff --git a/jina/executors/crafters/convert.py b/jina/executors/crafters/convert.py
@@ -13,46 +13,26 @@
 from . import BaseDocCrafter
 
 
-class FilePath2Buffer(BaseDocCrafter):
+class URI2Buffer(BaseDocCrafter):
     """ Convert local file path, remote URL doc to a buffer doc.
     """
 
-    def craft(self, file_path: str, *args, **kwargs):
-        if urllib.parse.urlparse(file_path).scheme in {'http', 'https', 'data'}:
-            page = urllib.request.Request(file_path, headers={'User-Agent': 'Mozilla/5.0'})
+    def craft(self, uri: str, *args, **kwargs):
+        if urllib.parse.urlparse(uri).scheme in {'http', 'https', 'data'}:
+            page = urllib.request.Request(uri, headers={'User-Agent': 'Mozilla/5.0'})
             tmp = urllib.request.urlopen(page)
             buffer = tmp.read()
-        elif os.path.exists(file_path):
-            with open(file_path, 'rb') as fp:
+        elif os.path.exists(uri):
+            with open(uri, 'rb') as fp:
                 buffer = fp.read()
         else:
-            raise FileNotFoundError(f'{file_path} is not a URL or a valid local path')
+            raise FileNotFoundError(f'{uri} is not a URL or a valid local path')
         return dict(buffer=buffer)
 
 
-class DataURI2Buffer(FilePath2Buffer):
-    """ Convert a data URI doc to a buffer doc.
-    """
-
-    def craft(self, data_uri: str, *args, **kwargs):
-        return super().craft(data_uri)
-
-
-class PathURI2Buffer(DataURI2Buffer):
-    def craft(self, file_path: str, data_uri: str, buffer: bytes, *args, **kwargs):
-        if buffer:
-            pass
-        elif file_path:
-            return FilePath2Buffer.craft(self, file_path)
-        elif data_uri:
-            return DataURI2Buffer.craft(self, data_uri)
-        else:
-            raise ValueError('this document has no "file_path", no "data_uri" and no "buffer" set')
-
-
-class FilePath2DataURI(FilePath2Buffer):
+class Path2DataURI(URI2Buffer):
     def __init__(self, charset: str = 'utf-8', base64: bool = False, *args, **kwargs):
-        """ Convert file path doc to data uri doc.
+        """ Convert file path doc to data uri doc. Internally it first reads into buffer and then converts it to data URI.
 
         :param charset: charset may be any character set registered with IANA
         :param base64: used to encode arbitrary octet sequences into a form that satisfies the rules of 7bit. Designed to be efficient for non-text 8 bit and binary data. Sometimes used for text data that frequently uses non-US-ASCII characters.
@@ -63,9 +43,9 @@ def __init__(self, charset: str = 'utf-8', base64: bool = False, *args, **kwargs
         self.charset = charset
         self.base64 = base64
 
-    def craft(self, file_path: str, mime_type: str, *args, **kwargs):
-        d = super().craft(file_path)
-        return dict(data_uri=self.make_datauri(mime_type, d['buffer']))
+    def craft(self, uri: str, mime_type: str, *args, **kwargs):
+        d = super().craft(uri)
+        return dict(uri=self.make_datauri(mime_type, d['buffer']))
 
     def make_datauri(self, mimetype, buffer):
         parts = ['data:', mimetype]
@@ -82,23 +62,23 @@ def make_datauri(self, mimetype, buffer):
         return ''.join(parts)
 
 
-class Buffer2DataURI(FilePath2DataURI):
+class Buffer2DataURI(Path2DataURI):
     """Convert buffer to data URI"""
 
     def craft(self, buffer: bytes, mime_type: str, *args, **kwargs):
-        return dict(data_uri=self.make_datauri(mime_type, buffer))
+        return dict(uri=self.make_datauri(mime_type, buffer))
 
 
 class Buffer2NdArray(BaseDocCrafter):
     """Convert buffer to numpy array"""
 
-    def craft(self, buffer, *args, **kwargs):
+    def craft(self, buffer: bytes, *args, **kwargs):
         return dict(blob=np.frombuffer(buffer))
 
 
-class Blob2PNGDataURI(FilePath2DataURI):
+class Blob2PNGDataURI(BaseDocCrafter):
     """Simple DocCrafter used in :command:`jina hello-world`,
-        it reads ``buffer`` into base64 png and stored in ``data_uri``"""
+        it reads ``buffer`` into base64 png and stored in ``uri``"""
 
     def __init__(self, width: int = 28, height: int = 28, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -128,4 +108,4 @@ def png_pack(png_tag, data):
             png_pack(b'IHDR', struct.pack('!2I5B', self.width, self.height, 8, 6, 0, 0, 0)),
             png_pack(b'IDAT', zlib.compress(raw_data, 9)),
             png_pack(b'IEND', b'')])
-        return dict(data_uri='data:image/png;base64,' + base64.b64encode(png_bytes).decode())
+        return dict(uri='data:image/png;base64,' + base64.b64encode(png_bytes).decode())
diff --git a/jina/helloworld/helper.py b/jina/helloworld/helper.py
@@ -18,10 +18,10 @@
 
 def print_result(resp):
     for d in resp.search.docs:
-        vi = d.data_uri
+        vi = d.uri
         result_html.append(f'<tr><td><img src="{vi}"/></td><td>')
         for kk in d.topk_results:
-            kmi = kk.match_doc.data_uri
+            kmi = kk.match_doc.uri
             result_html.append(f'<img src="{kmi}" style="opacity:{kk.score.value}"/>')
             # k['score']['explained'] = json.loads(kk.score.explained)
         result_html.append('</td></tr>\n')

diff --git a/jina/proto/jina.proto b/jina/proto/jina.proto
@@ -104,12 +104,6 @@ message Document {
         // the raw binary content of this document, which often represents the original document when comes into jina
         bytes buffer = 3;
 
-        // a data uri document
-        string data_uri = 9;
-
-        // a local file path, or a remote url starts with http or https points to a document
-        string file_path = 11;
-
        // the ndarray of the image/audio/video document
         NdArray blob = 12;
 
@@ -134,6 +128,9 @@ message Document {
 
     // mime type of this document, for buffer content, this is required; for other contents, this can be guessed
     string mime_type = 10;
+
+    // the URI of the document: a local file path, a remote URL starting with http or https, or a data URI
+    string uri = 9;
 }
 
 /**