Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Availability #5045

Merged
merged 6 commits into from
Mar 25, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion kalite/main/tests/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from django.core.cache.backends.locmem import LocMemCache

from kalite.testing.base import KALiteTestCase
from kalite.topic_tools.content_models import get_random_content
from kalite.topic_tools.content_models import get_random_content, update_item
from securesync.models import Device


Expand Down Expand Up @@ -56,5 +56,6 @@ def create_random_content_file(self):
fake_content_file = os.path.join(settings.CONTENT_ROOT, "%s.mp4" % youtube_id)
with open(fake_content_file, "w") as fh:
fh.write("")
update_item(update={"files_complete": 1, "available": True, "size_on_disk": 12}, path=content["path"])
self.assertTrue(os.path.exists(fake_content_file), "Make sure the content file was created, youtube_id='%s'." % youtube_id)
return (fake_content_file, content["id"], youtube_id, path)
6 changes: 4 additions & 2 deletions kalite/testing/base_environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,10 @@ def setup_content_paths(context, db):
# It then updates the items with these paths with their update dicts, and then propagates
# availability changes up the topic tree - this means that we can alter the availability of one item
# and make all its parent topics available, so that the item can be navigated to in integration tests.
annotate_content_models(db=db, iterator_content_items=lambda ids: [(
context.available_content_path, {"available": True})])
def iterator_content_items(ids=None, channel="khan", language="en"):
    """
    Stand-in content iterator for the test environment.

    All three arguments are accepted but ignored; the result is always a
    one-element list holding a (path, update) pair that flags
    ``context.available_content_path`` as available.
    """
    available_update = {"available": True}
    return [(context.available_content_path, available_update)]

annotate_content_models(db=db, iterator_content_items=iterator_content_items)

with Using(db, [Item], with_transaction=False):
context._unavailable_item = Item.create(
Expand Down
75 changes: 42 additions & 33 deletions kalite/topic_tools/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ def update_content_availability(content_list, language="en", channel="khan"):
else:
subtitle_langs[filename] = [lc]

subtitle_language_dir = language.replace("-", "_")

for content in content_list:
# Some nodes are duplicated, but they require the same information
# regardless of where they appear in the topic tree
Expand All @@ -64,41 +66,51 @@ def update_content_availability(content_list, language="en", channel="khan"):
# Ignore topics, as we only want to update their availability after we have updated the rest.
continue
else:
file_id = content.get("youtube_id", content.get("id"))
file_id = content.get("youtube_id")
default_thumbnail = create_thumbnail_url(content.get("id"))
format = content.get("format", "")
filename = file_id + "." + format
filename = file_id + "." + format if file_id else None

# Get list of subtitle language codes currently available
subtitle_lang_codes = subtitle_langs.get("{id}.vtt".format(id=content.get("id")), [])

if filename in contents_folder or language in subtitle_lang_codes:
if (filename not in contents_folder) and language in subtitle_lang_codes:
# The file is not available, but it might be available in English and can be subtitled
if content.get("id") + "." + format in contents_folder:
file_id = content.get("id")
filename = file_id + "." + format
else:
file_id = None
else:
# File for this language is available and downloaded, so let's stamp the file size on it!
update["size_on_disk"] = get_local_video_size(content.get("youtube_id"))
if file_id:
update["available"] = True
thumbnail = create_thumbnail_url(file_id) or default_thumbnail
update["content_urls"] = {
"stream": django_settings.CONTENT_URL + filename,
"stream_type": "{kind}/{format}".format(kind=content.get("kind").lower(), format=format),
"thumbnail": thumbnail,
}
elif django_settings.BACKUP_VIDEO_SOURCE:
file_id = content.get("youtube_id", content.get("id"))
update["available"] = True
update["content_urls"] = {
"stream": django_settings.BACKUP_VIDEO_SOURCE.format(youtube_id=file_id, video_format=format),
"stream_type": "{kind}/{format}".format(kind=content.get("kind").lower(), format=format),
"thumbnail": django_settings.BACKUP_VIDEO_SOURCE.format(youtube_id=file_id, video_format="png"),
}
if filename and filename in contents_folder:
update["files_complete"] = 1
# File for this language is available and downloaded, so let's stamp the file size on it!
update["size_on_disk"] = get_local_video_size(content.get("youtube_id"))
else:
# The video file for this content item does not exist. Set the files_complete and size_on_disk to 0
if content.get("files_complete"):
update["files_complete"] = 0
if content.get("size_on_disk"):
update["size_on_disk"] = 0
# Set file_id to None as a flag that this file should not be used in any later processing.
file_id = None

if not file_id and subtitle_language_dir in subtitle_lang_codes and (content.get("id") + "." + format in contents_folder):
# The file is not available in this language, but it is available in English and can be subtitled
file_id = content.get("id")
filename = file_id + "." + format

if file_id:
# We have a valid file_id (i.e. some file that we can use is available locally)
update["available"] = True
thumbnail = create_thumbnail_url(file_id) or default_thumbnail
update["content_urls"] = {
"stream": django_settings.CONTENT_URL + filename,
"stream_type": "{kind}/{format}".format(kind=content.get("kind").lower(), format=format),
"thumbnail": thumbnail,
}
elif django_settings.BACKUP_VIDEO_SOURCE:
# If the file is not available locally, but we are running the demo server, we want to serve the files
# from the Internet.
file_id = content.get("youtube_id", content.get("id"))
update["available"] = True
update["content_urls"] = {
"stream": django_settings.BACKUP_VIDEO_SOURCE.format(youtube_id=file_id, video_format=format),
"stream_type": "{kind}/{format}".format(kind=content.get("kind").lower(), format=format),
"thumbnail": django_settings.BACKUP_VIDEO_SOURCE.format(youtube_id=file_id, video_format="png"),
}

if update.get("available"):
# Don't bother doing this work if the video is not available at all
Expand All @@ -113,12 +125,9 @@ def update_content_availability(content_list, language="en", channel="khan"):
# Sort all subtitle URLs by language code
update["subtitle_urls"] = sorted(subtitle_urls, key=lambda x: x.get("code", ""))

update["files_complete"] = 1

# Content is currently flagged as available, but is not. Flag as unavailable.
if content.get("available") and "available" not in update:
update["available"] = False
update["files_complete"] = 0
update["size_on_disk"] = 0


yield content.get("path"), update
18 changes: 12 additions & 6 deletions kalite/topic_tools/content_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,19 +395,25 @@ def get_topic_contents(kinds=None, topic_id=None, **kwargs):


@set_database
def get_download_youtube_ids(paths=None, **kwargs):
def get_download_youtube_ids(paths=None, downloaded=False, **kwargs):
"""
Convenience function for taking a list of node paths and returning
all associated youtube_ids for downloads, regardless of whether the input
paths are paths for content nodes or topic nodes
:param paths: A list of paths to nodes - used to ensure uniqueness.
:param downloaded: If True, select only items whose files have already been downloaded (files_complete > 0); if False, select only items that have not been downloaded yet (files_complete == 0).
:return: A dict mapping each unique youtube_id to the title of its item.
"""
if paths:
youtube_ids = dict()
for path in paths:
selector = (Item.kind != "Topic") & (Item.path.contains(path)) & (Item.youtube_id.is_null(False))

if downloaded:
selector &= Item.files_complete > 0
else:
selector &= Item.files_complete == 0

youtube_ids.update(dict([item for item in Item.select(Item.youtube_id, Item.title).where(selector).tuples() if item[0]]))

return youtube_ids
Expand Down Expand Up @@ -587,7 +593,7 @@ def update_item(update=None, path=None, **kwargs):
item.save()


def iterator_content_items(ids=None, **kwargs):
def iterator_content_items(ids=None, channel="khan", language="en", **kwargs):
"""
Generator to iterate over content items specified by ids,
run update content availability on that item and then yield the
Expand All @@ -601,13 +607,13 @@ def iterator_content_items(ids=None, **kwargs):
items = Item.select().dicts().iterator()

mapped_items = itertools.imap(unparse_model_data, items)
updated_mapped_items = update_content_availability(mapped_items)
updated_mapped_items = update_content_availability(mapped_items, channel=channel, language=language)

for path, update in updated_mapped_items:
yield path, update


def iterator_content_items_by_youtube_id(ids=None, **kwargs):
def iterator_content_items_by_youtube_id(ids=None, channel="khan", language="en", **kwargs):
"""
Generator to iterate over content items specified by youtube ids,
run update content availability on that item and then yield the
Expand All @@ -621,7 +627,7 @@ def iterator_content_items_by_youtube_id(ids=None, **kwargs):
items = Item.select().dicts().iterator()

mapped_items = itertools.imap(unparse_model_data, items)
updated_mapped_items = update_content_availability(mapped_items)
updated_mapped_items = update_content_availability(mapped_items, channel=channel, language=language)

for path, update in updated_mapped_items:
yield path, update
Expand Down Expand Up @@ -661,7 +667,7 @@ def annotate_content_models(channel="khan", language="en", ids=None, iterator_co

db = kwargs.get("db")
if db:
content_models = iterator_content_items(ids=ids)
content_models = iterator_content_items(ids=ids, channel=channel, language=language)
with db.atomic() as transaction:
def recurse_availability_up_tree(node, available):
if not node.parent:
Expand Down
4 changes: 2 additions & 2 deletions kalite/updates/api_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def start_video_download(request):

lang = json.loads(request.body or "{}").get("lang", "en")

youtube_ids = get_download_youtube_ids(paths, language=lang)
youtube_ids = get_download_youtube_ids(paths, language=lang, downloaded=False)

queue = VideoQueue()

Expand All @@ -150,7 +150,7 @@ def delete_videos(request):

lang = json.loads(request.body or "{}").get("lang", "en")

youtube_ids = get_download_youtube_ids(paths, language=lang)
youtube_ids = get_download_youtube_ids(paths, language=lang, downloaded=True)

num_deleted = 0

Expand Down
61 changes: 33 additions & 28 deletions kalite/updates/management/commands/videodownload.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,34 +138,39 @@ def handle(self, *args, **options):
try:

progress_callback = partial(self.download_progress_callback, video)
try:
# Download via urllib
download_video(video.get("youtube_id"), callback=progress_callback)

except URLNotFound:
# Video was not found on amazon cloud service,
# either due to a KA mistake, or due to the fact
# that it's a dubbed video.
#
# We can use youtube-dl to get that video!!
logging.debug(_("Retrieving youtube video %(youtube_id)s via youtube-dl") % {"youtube_id": video.get("youtube_id")})

def youtube_dl_cb(stats, progress_callback, *args, **kwargs):
    """
    Convert a download progress report into a percentage and forward it.

    :param stats: Progress dict (presumably the one youtube-dl hands to its progress hook).
        ``status`` is read, plus ``downloaded_bytes`` and ``total_bytes``
        (falling back to ``total_bytes_estimate``) while the status is "downloading".
    :param progress_callback: Callable invoked exactly once as ``progress_callback(percent=<float>)``.
    :param args: Accepted and ignored.
    :param kwargs: Accepted and ignored.
    """
    status = stats.get("status")
    if status == "finished":
        percent = 100.
    elif status == "downloading":
        # The total size may be missing or None when it is not known up front; fall back
        # to the estimate instead of raising KeyError/TypeError in the middle of a download.
        total_bytes = stats.get("total_bytes") or stats.get("total_bytes_estimate")
        downloaded_bytes = stats.get("downloaded_bytes") or 0
        # An unknown or zero total reports 0% rather than dividing by zero.
        percent = 100. * downloaded_bytes / total_bytes if total_bytes else 0.
    else:
        percent = 0.
    progress_callback(percent=percent)
scrape_video(video.get("youtube_id"), quiet=not settings.DEBUG, callback=partial(youtube_dl_cb, progress_callback=progress_callback))

except IOError as e:
logging.exception(e)
failed_youtube_ids.append(video.get("youtube_id"))
video_queue.remove_file(video.get("youtube_id"))
time.sleep(10)
continue

# Don't try to download a file that already exists in the content dir - just say it was successful
# and call it a day!
if not os.path.exists(os.path.join(settings.CONTENT_ROOT, "{id}.mp4".format(id=video.get("youtube_id")))):

try:
# Download via urllib
download_video(video.get("youtube_id"), callback=progress_callback)

except URLNotFound:
# Video was not found on amazon cloud service,
# either due to a KA mistake, or due to the fact
# that it's a dubbed video.
#
# We can use youtube-dl to get that video!!
logging.debug(_("Retrieving youtube video %(youtube_id)s via youtube-dl") % {"youtube_id": video.get("youtube_id")})

def youtube_dl_cb(stats, progress_callback, *args, **kwargs):
    """
    Convert a download progress report into a percentage and forward it.

    :param stats: Progress dict (presumably the one youtube-dl hands to its progress hook).
        ``status`` is read, plus ``downloaded_bytes`` and ``total_bytes``
        (falling back to ``total_bytes_estimate``) while the status is "downloading".
    :param progress_callback: Callable invoked exactly once as ``progress_callback(percent=<float>)``.
    :param args: Accepted and ignored.
    :param kwargs: Accepted and ignored.
    """
    status = stats.get("status")
    if status == "finished":
        percent = 100.
    elif status == "downloading":
        # The total size may be missing or None when it is not known up front; fall back
        # to the estimate instead of raising KeyError/TypeError in the middle of a download.
        total_bytes = stats.get("total_bytes") or stats.get("total_bytes_estimate")
        downloaded_bytes = stats.get("downloaded_bytes") or 0
        # An unknown or zero total reports 0% rather than dividing by zero.
        percent = 100. * downloaded_bytes / total_bytes if total_bytes else 0.
    else:
        percent = 0.
    progress_callback(percent=percent)
scrape_video(video.get("youtube_id"), quiet=not settings.DEBUG, callback=partial(youtube_dl_cb, progress_callback=progress_callback))

except IOError as e:
logging.exception(e)
failed_youtube_ids.append(video.get("youtube_id"))
video_queue.remove_file(video.get("youtube_id"))
time.sleep(10)
continue

# If we got here, we downloaded ... somehow :)
handled_youtube_ids.append(video.get("youtube_id"))
Expand Down