learningequality · MCGallaspy · Mar 25, 2016 · Mar 25, 2016 · Mar 25, 2016 · Mar 25, 2016
diff --git a/kalite/main/tests/base.py b/kalite/main/tests/base.py
@@ -11,7 +11,7 @@
 from django.core.cache.backends.locmem import LocMemCache
 
 from kalite.testing.base import KALiteTestCase
-from kalite.topic_tools.content_models import get_random_content
+from kalite.topic_tools.content_models import get_random_content, update_item
 from securesync.models import Device
 
 
@@ -56,5 +56,6 @@ def create_random_content_file(self):
         fake_content_file = os.path.join(settings.CONTENT_ROOT, "%s.mp4" % youtube_id)
         with open(fake_content_file, "w") as fh:
             fh.write("")
+        update_item(update={"files_complete": 1, "available": True, "size_on_disk": 12}, path=content["path"])
         self.assertTrue(os.path.exists(fake_content_file), "Make sure the content file was created, youtube_id='%s'." % youtube_id)
         return (fake_content_file, content["id"], youtube_id, path)
diff --git a/kalite/testing/base_environment.py b/kalite/testing/base_environment.py
@@ -67,8 +67,10 @@ def setup_content_paths(context, db):
     # It then updates the items with these paths with their update dicts, and then propagates
     # availability changes up the topic tree - this means that we can alter the availability of one item
     # and make all its parent topics available so that it is navigable to in integration tests.
-    annotate_content_models(db=db, iterator_content_items=lambda ids: [(
-        context.available_content_path, {"available": True})])
+    def iterator_content_items(ids=None, channel="khan", language="en"):  # stub: ignores its arguments
+        return [(context.available_content_path, {"available": True})]  # single (path, update) pair marking that path available
+
+    annotate_content_models(db=db, iterator_content_items=iterator_content_items)
 
     with Using(db, [Item], with_transaction=False):
         context._unavailable_item = Item.create(

diff --git a/kalite/topic_tools/annotate.py b/kalite/topic_tools/annotate.py
@@ -47,6 +47,8 @@ def update_content_availability(content_list, language="en", channel="khan"):
                     else:
                         subtitle_langs[filename] = [lc]
 
+    subtitle_language_dir = language.replace("-", "_")
+
     for content in content_list:
         # Some nodes are duplicated, but they require the same information
         # regardless of where they appear in the topic tree
@@ -64,41 +66,51 @@ def update_content_availability(content_list, language="en", channel="khan"):
             # Ignore topics, as we only want to update their availability after we have updated the rest.
             continue
         else:
-            file_id = content.get("youtube_id", content.get("id"))
+            file_id = content.get("youtube_id")
             default_thumbnail = create_thumbnail_url(content.get("id"))
             format = content.get("format", "")
-            filename = file_id + "." + format
+            filename = file_id + "." + format if file_id else None
 
             # Get list of subtitle language codes currently available
             subtitle_lang_codes = subtitle_langs.get("{id}.vtt".format(id=content.get("id")), [])
 
-            if filename in contents_folder or language in subtitle_lang_codes:
-                if (filename not in contents_folder) and language in subtitle_lang_codes:
-                    # The file is not available, but it might be available in English and can be subtitled
-                    if content.get("id") + "." + format in contents_folder:
-                        file_id = content.get("id")
-                        filename = file_id + "." + format
-                    else:
-                        file_id = None
-                else:
-                    # File for this language is available and downloaded, so let's stamp the file size on it!
-                    update["size_on_disk"] = get_local_video_size(content.get("youtube_id"))
-                if file_id:
-                    update["available"] = True
-                    thumbnail = create_thumbnail_url(file_id) or default_thumbnail
-                    update["content_urls"] = {
-                        "stream": django_settings.CONTENT_URL + filename,
-                        "stream_type": "{kind}/{format}".format(kind=content.get("kind").lower(), format=format),
-                        "thumbnail": thumbnail,
-                    }
-                elif django_settings.BACKUP_VIDEO_SOURCE:
-                    file_id = content.get("youtube_id", content.get("id"))
-                    update["available"] = True
-                    update["content_urls"] = {
-                        "stream": django_settings.BACKUP_VIDEO_SOURCE.format(youtube_id=file_id, video_format=format),
-                        "stream_type": "{kind}/{format}".format(kind=content.get("kind").lower(), format=format),
-                        "thumbnail": django_settings.BACKUP_VIDEO_SOURCE.format(youtube_id=file_id, video_format="png"),
-                    }
+            if filename and filename in contents_folder:
+                update["files_complete"] = 1
+                # File for this language is available and downloaded, so let's stamp the file size on it!
+                update["size_on_disk"] = get_local_video_size(content.get("youtube_id"))
+            else:
+                # The video file for this content item does not exist. Set the files_complete and size_on_disk to 0
+                if content.get("files_complete"):
+                    update["files_complete"] = 0
+                if content.get("size_on_disk"):
+                    update["size_on_disk"] = 0
+                # Set file_id to None as a flag that this file should not be used in any later processing.
+                file_id = None
+
+            if not file_id and subtitle_language_dir in subtitle_lang_codes and (content.get("id") + "." + format in contents_folder):
+                # The file is not available in this language, but it is available in English and can be subtitled
+                file_id = content.get("id")
+                filename = file_id + "." + format
+
+            if file_id:
+                # We have a valid file_id (i.e. some file that we can use is available locally)
+                update["available"] = True
+                thumbnail = create_thumbnail_url(file_id) or default_thumbnail
+                update["content_urls"] = {
+                    "stream": django_settings.CONTENT_URL + filename,
+                    "stream_type": "{kind}/{format}".format(kind=content.get("kind").lower(), format=format),
+                    "thumbnail": thumbnail,
+                }
+            elif django_settings.BACKUP_VIDEO_SOURCE:
+                # If the file is not available locally, but we are running the demo server, we want to serve the files
+                # from the Internet.
+                file_id = content.get("youtube_id", content.get("id"))
+                update["available"] = True
+                update["content_urls"] = {
+                    "stream": django_settings.BACKUP_VIDEO_SOURCE.format(youtube_id=file_id, video_format=format),
+                    "stream_type": "{kind}/{format}".format(kind=content.get("kind").lower(), format=format),
+                    "thumbnail": django_settings.BACKUP_VIDEO_SOURCE.format(youtube_id=file_id, video_format="png"),
+                }
 
             if update.get("available"):
                 # Don't bother doing this work if the video is not available at all
@@ -113,12 +125,9 @@ def update_content_availability(content_list, language="en", channel="khan"):
                 # Sort all subtitle URLs by language code
                 update["subtitle_urls"] = sorted(subtitle_urls, key=lambda x: x.get("code", ""))
 
-                update["files_complete"] = 1
-
         # Content is currently flagged as available, but is not. Flag as unavailable.
         if content.get("available") and "available" not in update:
             update["available"] = False
-            update["files_complete"] = 0
-            update["size_on_disk"] = 0
+
 
         yield content.get("path"), update
diff --git a/kalite/topic_tools/content_models.py b/kalite/topic_tools/content_models.py
@@ -395,19 +395,25 @@ def get_topic_contents(kinds=None, topic_id=None, **kwargs):
 
 
 @set_database
-def get_download_youtube_ids(paths=None, **kwargs):
+def get_download_youtube_ids(paths=None, downloaded=False, **kwargs):
     """
     Convenience function for taking a list of content ids and returning
     all associated youtube_ids for downloads, regardless of whether the input
     paths are paths for content nodes or topic nodes
     :param paths: A list of paths to nodes - used to ensure uniqueness.
+    :param downloaded: If True, return only items whose files are complete (already downloaded); if False (default), only items not yet downloaded.
     :return: A unique list of youtube_ids as strings.
     """
     if paths:
         youtube_ids = dict()
         for path in paths:
             selector = (Item.kind != "Topic") & (Item.path.contains(path)) & (Item.youtube_id.is_null(False))
 
+            if downloaded:
+                selector &= Item.files_complete > 0
+            else:
+                selector &= Item.files_complete == 0
+
             youtube_ids.update(dict([item for item in Item.select(Item.youtube_id, Item.title).where(selector).tuples() if item[0]]))
 
         return youtube_ids
@@ -587,7 +593,7 @@ def update_item(update=None, path=None, **kwargs):
             item.save()
 
 
-def iterator_content_items(ids=None, **kwargs):
+def iterator_content_items(ids=None, channel="khan", language="en", **kwargs):
     """
     Generator to iterate over content items specified by ids,
     run update content availability on that item and then yield the
@@ -601,13 +607,13 @@ def iterator_content_items(ids=None, **kwargs):
         items = Item.select().dicts().iterator()
 
     mapped_items = itertools.imap(unparse_model_data, items)
-    updated_mapped_items = update_content_availability(mapped_items)
+    updated_mapped_items = update_content_availability(mapped_items, channel=channel, language=language)
 
     for path, update in updated_mapped_items:
         yield path, update
 
 
-def iterator_content_items_by_youtube_id(ids=None, **kwargs):
+def iterator_content_items_by_youtube_id(ids=None, channel="khan", language="en", **kwargs):
     """
     Generator to iterate over content items specified by youtube ids,
     run update content availability on that item and then yield the
@@ -621,7 +627,7 @@ def iterator_content_items_by_youtube_id(ids=None, **kwargs):
         items = Item.select().dicts().iterator()
 
     mapped_items = itertools.imap(unparse_model_data, items)
-    updated_mapped_items = update_content_availability(mapped_items)
+    updated_mapped_items = update_content_availability(mapped_items, channel=channel, language=language)
 
     for path, update in updated_mapped_items:
         yield path, update
@@ -661,7 +667,7 @@ def annotate_content_models(channel="khan", language="en", ids=None, iterator_co
 
     db = kwargs.get("db")
     if db:
-        content_models = iterator_content_items(ids=ids)
+        content_models = iterator_content_items(ids=ids, channel=channel, language=language)
         with db.atomic() as transaction:
             def recurse_availability_up_tree(node, available):
                 if not node.parent:

diff --git a/kalite/updates/api_views.py b/kalite/updates/api_views.py
@@ -128,7 +128,7 @@ def start_video_download(request):
 
     lang = json.loads(request.body or "{}").get("lang", "en")
 
-    youtube_ids = get_download_youtube_ids(paths, language=lang)
+    youtube_ids = get_download_youtube_ids(paths, language=lang, downloaded=False)
 
     queue = VideoQueue()
 
@@ -150,7 +150,7 @@ def delete_videos(request):
 
     lang = json.loads(request.body or "{}").get("lang", "en")
 
-    youtube_ids = get_download_youtube_ids(paths, language=lang)
+    youtube_ids = get_download_youtube_ids(paths, language=lang, downloaded=True)
 
     num_deleted = 0
 

diff --git a/kalite/updates/management/commands/videodownload.py b/kalite/updates/management/commands/videodownload.py
@@ -138,34 +138,39 @@ def handle(self, *args, **options):
                 try:
 
                     progress_callback = partial(self.download_progress_callback, video)
-                    try:
-                        # Download via urllib
-                        download_video(video.get("youtube_id"), callback=progress_callback)
-
-                    except URLNotFound:
-                        # Video was not found on amazon cloud service,
-                        #   either due to a KA mistake, or due to the fact
-                        #   that it's a dubbed video.
-                        #
-                        # We can use youtube-dl to get that video!!
-                        logging.debug(_("Retrieving youtube video %(youtube_id)s via youtube-dl") % {"youtube_id": video.get("youtube_id")})
-
-                        def youtube_dl_cb(stats, progress_callback, *args, **kwargs):
-                            if stats['status'] == "finished":
-                                percent = 100.
-                            elif stats['status'] == "downloading":
-                                percent = 100. * stats['downloaded_bytes'] / stats['total_bytes']
-                            else:
-                                percent = 0.
-                            progress_callback(percent=percent)
-                        scrape_video(video.get("youtube_id"), quiet=not settings.DEBUG, callback=partial(youtube_dl_cb, progress_callback=progress_callback))
-
-                    except IOError as e:
-                        logging.exception(e)
-                        failed_youtube_ids.append(video.get("youtube_id"))
-                        video_queue.remove_file(video.get("youtube_id"))
-                        time.sleep(10)
-                        continue
+
+                    # Don't try to download a file that already exists in the content dir - just say it was successful
+                    # and call it a day!
+                    if not os.path.exists(os.path.join(settings.CONTENT_ROOT, "{id}.mp4".format(id=video.get("youtube_id")))):
+
+                        try:
+                            # Download via urllib
+                            download_video(video.get("youtube_id"), callback=progress_callback)
+
+                        except URLNotFound:
+                            # Video was not found on amazon cloud service,
+                            #   either due to a KA mistake, or due to the fact
+                            #   that it's a dubbed video.
+                            #
+                            # We can use youtube-dl to get that video!!
+                            logging.debug(_("Retrieving youtube video %(youtube_id)s via youtube-dl") % {"youtube_id": video.get("youtube_id")})
+
+                            def youtube_dl_cb(stats, progress_callback, *args, **kwargs):
+                                # Convert a youtube-dl progress dict into a percentage and forward it to progress_callback.
+                                # youtube-dl can omit total_bytes (or report None), so fall back to the estimate, else 0%.
+                                total = stats.get('total_bytes') or stats.get('total_bytes_estimate')
+                                percent = 100. if stats['status'] == "finished" else 0.
+                                if stats['status'] == "downloading" and total:
+                                    percent = 100. * stats.get('downloaded_bytes', 0) / total
+                                progress_callback(percent=percent)
+                            scrape_video(video.get("youtube_id"), quiet=not settings.DEBUG, callback=partial(youtube_dl_cb, progress_callback=progress_callback))
+
+                        except IOError as e:
+                            logging.exception(e)
+                            failed_youtube_ids.append(video.get("youtube_id"))
+                            video_queue.remove_file(video.get("youtube_id"))
+                            time.sleep(10)
+                            continue
 
                     # If we got here, we downloaded ... somehow :)
                     handled_youtube_ids.append(video.get("youtube_id"))