++#include "internal.h"
++
++#ifdef CONFIG_X86
++#undef memcmp
++
++#ifdef CONFIG_X86_32
++#define memcmp memcmpx86_32
++/*
++ * Compare the 4-byte-aligned addresses s1 and s2 over n bytes
++ */
++int memcmpx86_32(void *s1, void *s2, size_t n)
++{
++ size_t num = n / 4;
++ register int res;
++
++ __asm__ __volatile__
++ (
++ "testl %3,%3\n\t"
++ "repe; cmpsd\n\t"
++ "je 1f\n\t"
++ "sbbl %0,%0\n\t"
++ "orl $1,%0\n"
++ "1:"
++ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
++ : "0" (0)
++ : "cc");
++
++ return res;
++}
++
++/*
++ * Check whether the page is all zeroes
++ */
++static int is_full_zero(const void *s1, size_t len)
++{
++ unsigned char same;
++
++ len /= 4;
++
++ __asm__ __volatile__
++ ("repe; scasl;"
++ "sete %0"
++ : "=qm" (same), "+D" (s1), "+c" (len)
++ : "a" (0)
++ : "cc");
++
++ return same;
++}
++
++
++#elif defined(CONFIG_X86_64)
++#define memcmp memcmpx86_64
++/*
++ * Compare the 8-byte-aligned addresses s1 and s2 over n bytes
++ */
++int memcmpx86_64(void *s1, void *s2, size_t n)
++{
++ size_t num = n / 8;
++ register int res;
++
++ __asm__ __volatile__
++ (
++ "testq %q3,%q3\n\t"
++ "repe; cmpsq\n\t"
++ "je 1f\n\t"
++ "sbbq %q0,%q0\n\t"
++ "orq $1,%q0\n"
++ "1:"
++ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
++ : "0" (0)
++ : "cc");
++
++ return res;
++}
++
++static int is_full_zero(const void *s1, size_t len)
++{
++ unsigned char same;
++
++ len /= 8;
++
++ __asm__ __volatile__
++ ("repe; scasq;"
++ "sete %0"
++ : "=qm" (same), "+D" (s1), "+c" (len)
++ : "a" (0)
++ : "cc");
++
++ return same;
++}
++
++#endif
++#else
++static int is_full_zero(const void *s1, size_t len)
++{
++ const unsigned long *src = s1;
++ size_t i;
++
++ len /= sizeof(*src);
++
++ for (i = 0; i < len; i++) {
++ if (src[i])
++ return 0;
++ }
++
++ return 1;
++}
++#endif
++
++#define UKSM_RUNG_ROUND_FINISHED (1 << 0)
++#define TIME_RATIO_SCALE 10000
++
++#define SLOT_TREE_NODE_SHIFT 8
++#define SLOT_TREE_NODE_STORE_SIZE (1UL << SLOT_TREE_NODE_SHIFT)
++struct slot_tree_node {
++ unsigned long size;
++ struct sradix_tree_node snode;
++ void *stores[SLOT_TREE_NODE_STORE_SIZE];
++};
++
++static struct kmem_cache *slot_tree_node_cachep;
++
++static struct sradix_tree_node *slot_tree_node_alloc(void)
++{
++ struct slot_tree_node *p;
++
++ p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL |
++ __GFP_NORETRY | __GFP_NOWARN);
++ if (!p)
++ return NULL;
++
++ return &p->snode;
++}
++
++static void slot_tree_node_free(struct sradix_tree_node *node)
++{
++ struct slot_tree_node *p;
++
++ p = container_of(node, struct slot_tree_node, snode);
++ kmem_cache_free(slot_tree_node_cachep, p);
++}
++
++static void slot_tree_node_extend(struct sradix_tree_node *parent,
++ struct sradix_tree_node *child)
++{
++ struct slot_tree_node *p, *c;
++
++ p = container_of(parent, struct slot_tree_node, snode);
++ c = container_of(child, struct slot_tree_node, snode);
++
++ p->size += c->size;
++}
++
++void slot_tree_node_assign(struct sradix_tree_node *node,
++ unsigned int index, void *item)
++{
++ struct vma_slot *slot = item;
++ struct slot_tree_node *cur;
++
++ slot->snode = node;
++ slot->sindex = index;
++
++ while (node) {
++ cur = container_of(node, struct slot_tree_node, snode);
++ cur->size += slot->pages;
++ node = node->parent;
++ }
++}
++
++void slot_tree_node_rm(struct sradix_tree_node *node, unsigned int offset)
++{
++ struct vma_slot *slot;
++ struct slot_tree_node *cur;
++ unsigned long pages;
++
++ if (node->height == 1) {
++ slot = node->stores[offset];
++ pages = slot->pages;
++ } else {
++ cur = container_of(node->stores[offset],
++ struct slot_tree_node, snode);
++ pages = cur->size;
++ }
++
++ while (node) {
++ cur = container_of(node, struct slot_tree_node, snode);
++ cur->size -= pages;
++ node = node->parent;
++ }
++}
++
++unsigned long slot_iter_index;
++int slot_iter(void *item, unsigned long height)
++{
++ struct slot_tree_node *node;
++ struct vma_slot *slot;
++
++ if (height == 1) {
++ slot = item;
++ if (slot_iter_index < slot->pages) {
++ /*in this one*/
++ return 1;
++ } else {
++ slot_iter_index -= slot->pages;
++ return 0;
++ }
++
++ } else {
++ node = container_of(item, struct slot_tree_node, snode);
++ if (slot_iter_index < node->size) {
++ /*in this one*/
++ return 1;
++ } else {
++ slot_iter_index -= node->size;
++ return 0;
++ }
++ }
++}
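++
++/*
++ * Illustrative sketch: slot_iter() and slot_iter_index together implement a
++ * page-weighted lookup. The index is a page offset within the whole rung;
++ * every rejected subtree or slot subtracts its own size, until the offset
++ * falls inside one slot. A plain-C equivalent over an array of hypothetical
++ * leaf slots would be:
++ *
++ *	struct vma_slot *pick_slot(struct vma_slot **slots, int n,
++ *				   unsigned long index)
++ *	{
++ *		int i;
++ *
++ *		for (i = 0; i < n; i++) {
++ *			if (index < slots[i]->pages)
++ *				return slots[i];
++ *			index -= slots[i]->pages;
++ *		}
++ *		return NULL;
++ *	}
++ *
++ * e.g. with slots of 3, 5 and 2 pages and index == 6, the first slot is
++ * skipped (index becomes 3) and page 3 of the second slot is selected.
++ */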
++
++
++static inline void slot_tree_init_root(struct sradix_tree_root *root)
++{
++ init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT);
++ root->alloc = slot_tree_node_alloc;
++ root->free = slot_tree_node_free;
++ root->extend = slot_tree_node_extend;
++ root->assign = slot_tree_node_assign;
++ root->rm = slot_tree_node_rm;
++}
++
++void slot_tree_init(void)
++{
++ slot_tree_node_cachep = kmem_cache_create("slot_tree_node",
++ sizeof(struct slot_tree_node), 0,
++ SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
++ NULL);
++}
++
++
++/* Each rung of this ladder is a list of VMAs having a same scan ratio */
++struct scan_rung {
++ //struct list_head scanned_list;
++ struct sradix_tree_root vma_root;
++ struct sradix_tree_root vma_root2;
++
++ struct vma_slot *current_scan;
++ unsigned long current_offset;
++
++ /*
++ * The initial value for current_offset; it should loop over
++ * [0, step - 1] so that every slot gets a chance to be scanned.
++ */
++ unsigned long offset_init;
++ unsigned long step; /* dynamic step for current_offset */
++ unsigned int flags;
++ unsigned long pages_to_scan;
++ //unsigned long fully_scanned_slots;
++ /*
++ * A little bit tricky - if cpu_ratio > 0, then the value is the
++ * cpu time ratio it can spend in this rung for every scan
++ * period. If < 0, then it is the cpu time ratio relative to the
++ * max cpu percentage specified by the user. Both are in units of
++ * 1/TIME_RATIO_SCALE.
++ */
++ int cpu_ratio;
++
++ /*
++ * How long will it take for all slots in this rung to be fully
++ * scanned? If it's zero, we don't care about the cover time:
++ * it's fully scanned.
++ */
++ unsigned int cover_msecs;
++ //unsigned long vma_num;
++ //unsigned long pages; /* Sum of all slot's pages in rung */
++};
++
++/**
++ * struct tree_node - node of either the stable or unstable rbtree
++ */
++struct tree_node {
++ struct rb_node node; /* link in the main (un)stable rbtree */
++ struct rb_root sub_root; /* rb_root for sublevel collision rbtree */
++ u32 hash;
++ unsigned long count; /* TODO: merged with sub_root */
++ struct list_head all_list; /* all tree nodes in stable/unstable tree */
++};
++
++/**
++ * struct stable_node - node of the stable rbtree
++ * @node: rb node of this ksm page in the stable (sub-)tree
++ * @tree_node: the first-level tree_node this stable node hangs under
++ * @hlist: hlist head of node_vmas using this ksm page
++ * @kpfn: page frame number of this ksm page
++ * @hash_max: full-strength hash of the page, 0 if not yet calculated
++ * @all_list: link in the list of all stable nodes
++ */
++struct stable_node {
++ struct rb_node node; /* link in sub-rbtree */
++ struct tree_node *tree_node; /* its tree_node in the stable tree, NULL if it's in the hell list */
++ struct hlist_head hlist;
++ unsigned long kpfn;
++ u32 hash_max; /* if == 0 then it has not been calculated yet */
++ struct list_head all_list; /* in a list for all stable nodes */
++};
++
++/**
++ * struct node_vma - groups the rmap_items of one vma_slot that are
++ * linked into the same stable node.
++ */
++struct node_vma {
++ union {
++ struct vma_slot *slot;
++ unsigned long key; /* slot is used as key sorted on hlist */
++ };
++ struct hlist_node hlist;
++ struct hlist_head rmap_hlist;
++ struct stable_node *head;
++};
++
++/**
++ * struct rmap_item - reverse mapping item for virtual addresses
++ * @slot: the vma_slot this rmap_item belongs to
++ * @page: the page this rmap_item tracks
++ * @address: the virtual address this rmap_item tracks (+ flags in low bits)
++ * @node: rb node of this rmap_item in the unstable (sub-)tree
++ * @head: pointer to the node_vma heading this list in the stable tree
++ * @hlist: link into hlist of rmap_items hanging off that node_vma
++ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
++ */
++struct rmap_item {
++ struct vma_slot *slot;
++ struct page *page;
++ unsigned long address; /* + low bits used for flags below */
++ unsigned long hash_round;
++ unsigned long entry_index;
++ union {
++ struct {/* when in unstable tree */
++ struct rb_node node;
++ struct tree_node *tree_node;
++ u32 hash_max;
++ };
++ struct { /* when in stable tree */
++ struct node_vma *head;
++ struct hlist_node hlist;
++ struct anon_vma *anon_vma;
++ };
++ };
++} __aligned(4);
++
++struct rmap_list_entry {
++ union {
++ struct rmap_item *item;
++ unsigned long addr;
++ };
++ /* lowest bit is used for is_addr tag */
++} __aligned(4); /* 4-byte aligned to fit into pool pages */
++
++
++/* Basic data structure definition ends */
++
++
++/*
++ * Flags for rmap_item to judge if it's listed in the stable/unstable tree.
++ * The flags use the low bits of rmap_item.address
++ */
++#define UNSTABLE_FLAG 0x1
++#define STABLE_FLAG 0x2
++#define get_rmap_addr(x) ((x)->address & PAGE_MASK)
++
++/*
++ * rmap_list_entry helpers
++ */
++#define IS_ADDR_FLAG 1
++#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG)
++#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG)
++#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG))
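++
++/*
++ * Illustrative example: an entry in the per-slot rmap_list pool holds either
++ * a struct rmap_item pointer or a tagged page address. Since rmap_item and
++ * rmap_list_entry are both __aligned(4), the low bits of a valid pointer are
++ * always clear, so bit 0 can carry the is_addr tag:
++ *
++ *	struct rmap_list_entry entry;
++ *	unsigned long addr = some_addr & PAGE_MASK;
++ *
++ *	entry.addr = addr;
++ *	set_is_addr(entry.addr);
++ *
++ *	if (is_addr(entry.addr))
++ *		addr = get_clean_addr(entry.addr);
++ *	else
++ *		use(entry.item);
++ *
++ * where some_addr and use() are stand-ins for the caller's own data.
++ */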
++
++
++/*
++ * High speed caches for frequently allocated and freed structs
++ */
++static struct kmem_cache *rmap_item_cache;
++static struct kmem_cache *stable_node_cache;
++static struct kmem_cache *node_vma_cache;
++static struct kmem_cache *vma_slot_cache;
++static struct kmem_cache *tree_node_cache;
++#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\
++ sizeof(struct __struct), __alignof__(struct __struct),\
++ (__flags), NULL)
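++
++/*
++ * For illustration, the caches above are meant to be created with this macro
++ * at init time, along the lines of (hypothetical init snippet):
++ *
++ *	rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0);
++ *	stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0);
++ *	if (!rmap_item_cache || !stable_node_cache)
++ *		return -ENOMEM;
++ *
++ * which expands to kmem_cache_create("uksm_rmap_item", ...) and so on.
++ */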
++
++/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */
++#define SCAN_LADDER_SIZE 4
++static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE];
++
++/* The evaluation rounds uksmd has finished */
++static unsigned long long uksm_eval_round = 1;
++
++/*
++ * We add 1 to this var when we decide the whole unstable tree should
++ * be rebuilt.
++ */
++static unsigned long uksm_hash_round = 1;
++
++/*
++ * How many times the whole memory is scanned.
++ */
++static unsigned long long fully_scanned_round = 1;
++
++/* The total number of virtual pages of all vma slots */
++static u64 uksm_pages_total;
++
++/* The number of pages scanned since startup */
++static u64 uksm_pages_scanned;
++
++static u64 scanned_virtual_pages;
++
++/* The number of pages that had been scanned at the last encode_benefit() call */
++static u64 uksm_pages_scanned_last;
++
++/* If the scanned number is too large, we encode it here */
++static u64 pages_scanned_stored;
++
++static unsigned long pages_scanned_base;
++
++/* The number of nodes in the stable tree */
++static unsigned long uksm_pages_shared;
++
++/* The number of page slots additionally sharing those nodes */
++static unsigned long uksm_pages_sharing;
++
++/* The number of nodes in the unstable tree */
++static unsigned long uksm_pages_unshared;
++
++/*
++ * Milliseconds ksmd should sleep between scans,
++ * >= 100ms to be consistent with
++ * scan_time_to_sleep_msec()
++ */
++static unsigned int uksm_sleep_jiffies;
++
++/* The real value for the uksmd next sleep */
++static unsigned int uksm_sleep_real;
++
++/* Saved value for user input uksm_sleep_jiffies when it's enlarged */
++static unsigned int uksm_sleep_saved;
++
++/* Max percentage of cpu utilization ksmd can take to scan in one batch */
++static unsigned int uksm_max_cpu_percentage;
++
++static int uksm_cpu_governor;
++
++static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" };
++
++struct uksm_cpu_preset_s {
++ int cpu_ratio[SCAN_LADDER_SIZE];
++ unsigned int cover_msecs[SCAN_LADDER_SIZE];
++ unsigned int max_cpu; /* percentage */
++};
++
++struct uksm_cpu_preset_s uksm_cpu_preset[4] = {
++ { {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95},
++ { {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50},
++ { {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20},
++ { {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1},
++};
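++
++/*
++ * An illustrative reading of the presets above, based on the cpu_ratio
++ * comment in struct scan_rung: ratios are in units of 1/TIME_RATIO_SCALE
++ * (1/10000); a positive value is an absolute share of each scan period,
++ * a negative value is taken relative to max_cpu. So for the "full" preset
++ * with max_cpu = 95:
++ *
++ *	rung 3: -10000  ->  10000/10000 of 95%  =  95% cpu at most
++ *	rung 2:  -2500  ->   2500/10000 of 95%  =  23.75% cpu at most
++ *	rung 0:     20  ->     20/10000         =  0.2% of each scan period
++ */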
++
++/* The default value for uksm_ema_page_time if it's not initialized */
++#define UKSM_PAGE_TIME_DEFAULT 500
++
++/* Cost to scan one page, as an exponential moving average, in nsecs */
++static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
++
++/* The exponential moving average alpha weight, in percentage. */
++#define EMA_ALPHA 20
++
++/*
++ * The threshold used to filter out thrashing areas.
++ * If it is 0, filtering is disabled; otherwise it is the percentage upper
++ * bound of the thrashing ratio of all areas. Any area with a bigger thrashing ratio
++ * will be considered as having a zero duplication ratio.
++ */
++static unsigned int uksm_thrash_threshold = 50;
++
++/* What dedup ratio is considered abundant */
++static unsigned int uksm_abundant_threshold = 10;
++
++/* All slots having merged pages in this eval round. */
++struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup);
++
++/* How many times ksmd has slept since startup */
++static unsigned long long uksm_sleep_times;
++
++#define UKSM_RUN_STOP 0
++#define UKSM_RUN_MERGE 1
++static unsigned int uksm_run = 1;
++
++static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait);
++static DEFINE_MUTEX(uksm_thread_mutex);
++
++/*
++ * List vma_slot_new is for newly created vma_slots waiting to be added by
++ * ksmd. If one cannot be added (e.g. because it's too small), it's moved to
++ * vma_slot_noadd. vma_slot_del is the list for vma_slots whose corresponding
++ * VMA has been removed/freed.
++ */
++struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new);
++struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd);
++struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del);
++static DEFINE_SPINLOCK(vma_slot_list_lock);
++
++/* The unstable tree heads */
++static struct rb_root root_unstable_tree = RB_ROOT;
++
++/*
++ * All tree_nodes are in a list to be freed at once when unstable tree is
++ * freed after each scan round.
++ */
++static struct list_head unstable_tree_node_list =
++ LIST_HEAD_INIT(unstable_tree_node_list);
++
++/* List contains all stable nodes */
++static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list);
++
++/*
++ * When the hash strength is changed, the stable tree must be delta_hashed and
++ * re-structured. We use two sets of the structs below to speed up the
++ * re-structuring of the stable tree.
++ */
++static struct list_head
++stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]),
++ LIST_HEAD_INIT(stable_tree_node_list[1])};
++
++static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0];
++static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT};
++static struct rb_root *root_stable_treep = &root_stable_tree[0];
++static unsigned long stable_tree_index;
++
++/* The hash strength needed to hash a full page */
++#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32))
++
++/* The hash strength needed for loop-back hashing */
++#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10)
++
++/* The random offsets in a page */
++static u32 *random_nums;
++
++/* The hash strength */
++static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4;
++
++/* The delta value each time the hash strength increases or decreases */
++static unsigned long hash_strength_delta;
++#define HASH_STRENGTH_DELTA_MAX 5
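++
++/*
++ * Worked numbers, assuming 4 KiB pages: HASH_STRENGTH_FULL = 4096 / 4 = 1024
++ * u32 samples (the whole page), HASH_STRENGTH_MAX = 1034, and the initial
++ * hash_strength above is HASH_STRENGTH_FULL >> 4 = 64 samples per page.
++ */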
++
++/* The time we have saved due to random_sample_hash */
++static u64 rshash_pos;
++
++/* The time we have wasted due to hash collision */
++static u64 rshash_neg;
++
++struct uksm_benefit {
++ u64 pos;
++ u64 neg;
++ u64 scanned;
++ unsigned long base;
++} benefit;
++
++/*
++ * The relative cost of memcmp, compared to 1 time unit of random sample
++ * hash; this value is measured when the ksm module is initialized
++ */
++static unsigned long memcmp_cost;
++
++static unsigned long rshash_neg_cont_zero;
++static unsigned long rshash_cont_obscure;
++
++/* The possible states of hash strength adjustment heuristic */
++enum rshash_states {
++ RSHASH_STILL,
++ RSHASH_TRYUP,
++ RSHASH_TRYDOWN,
++ RSHASH_NEW,
++ RSHASH_PRE_STILL,
++};
++
++/* The possible direction we are about to adjust hash strength */
++enum rshash_direct {
++ GO_UP,
++ GO_DOWN,
++ OBSCURE,
++ STILL,
++};
++
++/* random sampling hash state machine */
++static struct {
++ enum rshash_states state;
++ enum rshash_direct pre_direct;
++ u8 below_count;
++ /* Keep a lookup window of size 5; if above_count/below_count > 3
++ * in this window, we stop trying.
++ */
++ u8 lookup_window_index;
++ u64 stable_benefit;
++ unsigned long turn_point_down;
++ unsigned long turn_benefit_down;
++ unsigned long turn_point_up;
++ unsigned long turn_benefit_up;
++ unsigned long stable_point;
++} rshash_state;
++
++/* Zero page hash table, hash_strength in [0, HASH_STRENGTH_MAX] */
++static u32 *zero_hash_table;
++
++static inline struct node_vma *alloc_node_vma(void)
++{
++ struct node_vma *node_vma;
++
++ node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL |
++ __GFP_NORETRY | __GFP_NOWARN);
++ if (node_vma) {
++ INIT_HLIST_HEAD(&node_vma->rmap_hlist);
++ INIT_HLIST_NODE(&node_vma->hlist);
++ }
++ return node_vma;
++}
++
++static inline void free_node_vma(struct node_vma *node_vma)
++{
++ kmem_cache_free(node_vma_cache, node_vma);
++}
++
++
++static inline struct vma_slot *alloc_vma_slot(void)
++{
++ struct vma_slot *slot;
++
++ /*
++ * In case uksm has not been initialized yet.
++ * TODO: reconsider the call site of uksm_init() in the future.
++ */
++ if (!vma_slot_cache)
++ return NULL;
++
++ slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL |
++ __GFP_NORETRY | __GFP_NOWARN);
++ if (slot) {
++ INIT_LIST_HEAD(&slot->slot_list);
++ INIT_LIST_HEAD(&slot->dedup_list);
++ slot->flags |= UKSM_SLOT_NEED_RERAND;
++ }
++ return slot;
++}
++
++static inline void free_vma_slot(struct vma_slot *vma_slot)
++{
++ kmem_cache_free(vma_slot_cache, vma_slot);
++}
++
++
++
++static inline struct rmap_item *alloc_rmap_item(void)
++{
++ struct rmap_item *rmap_item;
++
++ rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
++ __GFP_NORETRY | __GFP_NOWARN);
++ if (rmap_item) {
++ /* BUG if the lowest bit is not clear; it is reserved for flag use */
++ BUG_ON(is_addr(rmap_item));
++ }
++ return rmap_item;
++}
++
++static inline void free_rmap_item(struct rmap_item *rmap_item)
++{
++ rmap_item->slot = NULL; /* debug safety */
++ kmem_cache_free(rmap_item_cache, rmap_item);
++}
++
++static inline struct stable_node *alloc_stable_node(void)
++{
++ struct stable_node *node;
++
++ node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL |
++ __GFP_NORETRY | __GFP_NOWARN);
++ if (!node)
++ return NULL;
++
++ INIT_HLIST_HEAD(&node->hlist);
++ list_add(&node->all_list, &stable_node_list);
++ return node;
++}
++
++static inline void free_stable_node(struct stable_node *stable_node)
++{
++ list_del(&stable_node->all_list);
++ kmem_cache_free(stable_node_cache, stable_node);
++}
++
++static inline struct tree_node *alloc_tree_node(struct list_head *list)
++{
++ struct tree_node *node;
++
++ node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL |
++ __GFP_NORETRY | __GFP_NOWARN);
++ if (!node)
++ return NULL;
++
++ list_add(&node->all_list, list);
++ return node;
++}
++
++static inline void free_tree_node(struct tree_node *node)
++{
++ list_del(&node->all_list);
++ kmem_cache_free(tree_node_cache, node);
++}
++
++static void uksm_drop_anon_vma(struct rmap_item *rmap_item)
++{
++ struct anon_vma *anon_vma = rmap_item->anon_vma;
++
++ put_anon_vma(anon_vma);
++}
++
++
++/**
++ * Remove a stable node from stable_tree, may unlink from its tree_node and
++ * may remove its parent tree_node if no other stable node is pending.
++ *
++ * @stable_node The node to be removed
++ * @unlink_rb Will this node be unlinked from the rbtree?
++ * @remove_tree_node Will its tree_node be removed if empty?
++ */
++static void remove_node_from_stable_tree(struct stable_node *stable_node,
++ int unlink_rb, int remove_tree_node)
++{
++ struct node_vma *node_vma;
++ struct rmap_item *rmap_item;
++ struct hlist_node *n;
++
++ if (!hlist_empty(&stable_node->hlist)) {
++ hlist_for_each_entry_safe(node_vma, n,
++ &stable_node->hlist, hlist) {
++ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
++ uksm_pages_sharing--;
++
++ uksm_drop_anon_vma(rmap_item);
++ rmap_item->address &= PAGE_MASK;
++ }
++ free_node_vma(node_vma);
++ cond_resched();
++ }
++
++ /* the last one is counted as shared */
++ uksm_pages_shared--;
++ uksm_pages_sharing++;
++ }
++
++ if (stable_node->tree_node && unlink_rb) {
++ rb_erase(&stable_node->node,
++ &stable_node->tree_node->sub_root);
++
++ if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) &&
++ remove_tree_node) {
++ rb_erase(&stable_node->tree_node->node,
++ root_stable_treep);
++ free_tree_node(stable_node->tree_node);
++ } else {
++ stable_node->tree_node->count--;
++ }
++ }
++
++ free_stable_node(stable_node);
++}
++
++
++/*
++ * get_uksm_page: checks if the page indicated by the stable node
++ * is still its ksm page, despite having held no reference to it.
++ * In which case we can trust the content of the page, and it
++ * returns the gotten page; but if the page has now been zapped,
++ * remove the stale node from the stable tree and return NULL.
++ *
++ * You would expect the stable_node to hold a reference to the ksm page.
++ * But if it increments the page's count, swapping out has to wait for
++ * ksmd to come around again before it can free the page, which may take
++ * seconds or even minutes: much too unresponsive. So instead we use a
++ * "keyhole reference": access to the ksm page from the stable node peeps
++ * out through its keyhole to see if that page still holds the right key,
++ * pointing back to this stable node. This relies on freeing a PageAnon
++ * page to reset its page->mapping to NULL, and relies on no other use of
++ * a page to put something that might look like our key in page->mapping.
++ *
++ * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
++ * but this is different - made simpler by uksm_thread_mutex being held, but
++ * interesting for assuming that no other use of the struct page could ever
++ * put our expected_mapping into page->mapping (or a field of the union which
++ * coincides with page->mapping). The RCU calls are not for KSM at all, but
++ * to keep the page_count protocol described with page_cache_get_speculative.
++ *
++ * Note: it is possible that get_uksm_page() will return NULL one moment,
++ * then page the next, if the page is in between page_freeze_refs() and
++ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
++ * is on its way to being freed; but it is an anomaly to bear in mind.
++ *
++ * @unlink_rb: if the removal of this node will firstly unlink from
++ * its rbtree. stable_node_reinsert will prevent this when restructuring the
++ * node from its old tree.
++ *
++ * @remove_tree_node: if this is the last one of its tree_node, will the
++ * tree_node be freed ? If we are inserting stable node, this tree_node may
++ * be reused, so don't free it.
++ */
++static struct page *get_uksm_page(struct stable_node *stable_node,
++ int unlink_rb, int remove_tree_node)
++{
++ struct page *page;
++ void *expected_mapping;
++ unsigned long kpfn;
++
++ expected_mapping = (void *)((unsigned long)stable_node |
++ PAGE_MAPPING_KSM);
++again:
++ kpfn = READ_ONCE(stable_node->kpfn);
++ page = pfn_to_page(kpfn);
++
++ /*
++ * page is computed from kpfn, so on most architectures reading
++ * page->mapping is naturally ordered after reading node->kpfn,
++ * but on Alpha we need to be more careful.
++ */
++ smp_read_barrier_depends();
++
++ if (READ_ONCE(page->mapping) != expected_mapping)
++ goto stale;
++
++ /*
++ * We cannot do anything with the page while its refcount is 0.
++ * Usually 0 means free, or tail of a higher-order page: in which
++ * case this node is no longer referenced, and should be freed;
++ * however, it might mean that the page is under page_freeze_refs().
++ * The __remove_mapping() case is easy, again the node is now stale;
++ * but if page is swapcache in migrate_page_move_mapping(), it might
++ * still be our page, in which case it's essential to keep the node.
++ */
++ while (!get_page_unless_zero(page)) {
++ /*
++ * Another check for page->mapping != expected_mapping would
++ * work here too. We have chosen the !PageSwapCache test to
++ * optimize the common case, when the page is or is about to
++ * be freed: PageSwapCache is cleared (under spin_lock_irq)
++ * in the freeze_refs section of __remove_mapping(); but Anon
++ * page->mapping reset to NULL later, in free_pages_prepare().
++ */
++ if (!PageSwapCache(page))
++ goto stale;
++ cpu_relax();
++ }
++
++ if (READ_ONCE(page->mapping) != expected_mapping) {
++ put_page(page);
++ goto stale;
++ }
++
++ lock_page(page);
++ if (READ_ONCE(page->mapping) != expected_mapping) {
++ unlock_page(page);
++ put_page(page);
++ goto stale;
++ }
++ unlock_page(page);
++ return page;
++stale:
++ /*
++ * We come here from above when page->mapping or !PageSwapCache
++ * suggests that the node is stale; but it might be under migration.
++ * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
++ * before checking whether node->kpfn has been changed.
++ */
++ smp_rmb();
++ if (stable_node->kpfn != kpfn)
++ goto again;
++
++ remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node);
++
++ return NULL;
++}
++
++/*
++ * Remove an rmap_item from the stable or unstable tree and clean up
++ * its bookkeeping in that tree.
++ */
++static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
++{
++ if (rmap_item->address & STABLE_FLAG) {
++ struct stable_node *stable_node;
++ struct node_vma *node_vma;
++ struct page *page;
++
++ node_vma = rmap_item->head;
++ stable_node = node_vma->head;
++ page = get_uksm_page(stable_node, 1, 1);
++ if (!page)
++ goto out;
++
++ /*
++ * page lock is needed because it's racing with
++ * try_to_unmap_ksm(), etc.
++ */
++ lock_page(page);
++ hlist_del(&rmap_item->hlist);
++
++ if (hlist_empty(&node_vma->rmap_hlist)) {
++ hlist_del(&node_vma->hlist);
++ free_node_vma(node_vma);
++ }
++ unlock_page(page);
++
++ put_page(page);
++ if (hlist_empty(&stable_node->hlist)) {
++ /* do NOT call remove_node_from_stable_tree() here,
++ * it's possible for a forked rmap_item to not be in the
++ * stable tree while the in-tree rmap_items have been
++ * deleted.
++ */
++ uksm_pages_shared--;
++ } else
++ uksm_pages_sharing--;
++
++
++ uksm_drop_anon_vma(rmap_item);
++ } else if (rmap_item->address & UNSTABLE_FLAG) {
++ if (rmap_item->hash_round == uksm_hash_round) {
++
++ rb_erase(&rmap_item->node,
++ &rmap_item->tree_node->sub_root);
++ if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) {
++ rb_erase(&rmap_item->tree_node->node,
++ &root_unstable_tree);
++
++ free_tree_node(rmap_item->tree_node);
++ } else
++ rmap_item->tree_node->count--;
++ }
++ uksm_pages_unshared--;
++ }
++
++ rmap_item->address &= PAGE_MASK;
++ rmap_item->hash_max = 0;
++
++out:
++ cond_resched(); /* we're called from many long loops */
++}
++
++static inline int slot_in_uksm(struct vma_slot *slot)
++{
++ return list_empty(&slot->slot_list);
++}
++
++/*
++ * Test if the mm is exiting
++ */
++static inline bool uksm_test_exit(struct mm_struct *mm)
++{
++ return atomic_read(&mm->mm_users) == 0;
++}
++
++static inline unsigned long vma_pool_size(struct vma_slot *slot)
++{
++ return round_up(sizeof(struct rmap_list_entry) * slot->pages,
++ PAGE_SIZE) >> PAGE_SHIFT;
++}
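++
++/*
++ * Illustrative example: on a 64-bit kernel with 4 KiB pages,
++ * sizeof(struct rmap_list_entry) == 8, so a 1000-page VMA needs
++ *
++ *	round_up(8 * 1000, PAGE_SIZE) >> PAGE_SHIFT == 2
++ *
++ * pool pages, and slot_pool_alloc() below sizes the rmap_list_pool and
++ * pool_counts arrays accordingly.
++ */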
++
++#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta))
++
++/* must be done with sem locked */
++static int slot_pool_alloc(struct vma_slot *slot)
++{
++ unsigned long pool_size;
++
++ if (slot->rmap_list_pool)
++ return 0;
++
++ pool_size = vma_pool_size(slot);
++ slot->rmap_list_pool = kcalloc(pool_size, sizeof(struct page *),
++ GFP_KERNEL);
++ if (!slot->rmap_list_pool)
++ return -ENOMEM;
++
++ slot->pool_counts = kcalloc(pool_size, sizeof(unsigned int),
++ GFP_KERNEL);
++ if (!slot->pool_counts) {
++ kfree(slot->rmap_list_pool);
++ return -ENOMEM;
++ }
++
++ slot->pool_size = pool_size;
++ BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages));
++ slot->flags |= UKSM_SLOT_IN_UKSM;
++ uksm_pages_total += slot->pages;
++
++ return 0;
++}
++
++/*
++ * Called after vma is unlinked from its mm
++ */
++void uksm_remove_vma(struct vm_area_struct *vma)
++{
++ struct vma_slot *slot;
++
++ if (!vma->uksm_vma_slot)
++ return;
++
++ spin_lock(&vma_slot_list_lock);
++ slot = vma->uksm_vma_slot;
++ if (!slot)
++ goto out;
++
++ if (slot_in_uksm(slot)) {
++ /**
++ * This slot has been added by ksmd, so move it to the del list
++ * and wait for ksmd to free it.
++ */
++ list_add_tail(&slot->slot_list, &vma_slot_del);
++ } else {
++ /**
++ * It's still on the new list, so it's OK to free the slot directly.
++ */
++ list_del(&slot->slot_list);
++ free_vma_slot(slot);
++ }
++out:
++ vma->uksm_vma_slot = NULL;
++ spin_unlock(&vma_slot_list_lock);
++}
++
++/**
++ * Need to do two things:
++ * 1. check if the slot was moved to the del list
++ * 2. make sure the mmap_sem is manipulated under a valid vma.
++ *
++ * My concern here is that in some cases, this may make
++ * vma_slot_list_lock waiters be serialized further by some
++ * sem->wait_lock; can this really be expensive?
++ *
++ * @return
++ * 0: if successfully locked mmap_sem
++ * -ENOENT: this slot was moved to the del list
++ * -EBUSY: vma lock failed
++ */
++static int try_down_read_slot_mmap_sem(struct vma_slot *slot)
++{
++ struct vm_area_struct *vma;
++ struct mm_struct *mm;
++ struct rw_semaphore *sem;
++
++ spin_lock(&vma_slot_list_lock);
++
++ /* The slot_list was removed and re-inited from the new list when it
++ * entered uksm. If it's now not empty, it must have been moved to the
++ * del list.
++ */
++ if (!slot_in_uksm(slot)) {
++ spin_unlock(&vma_slot_list_lock);
++ return -ENOENT;
++ }
++
++ BUG_ON(slot->pages != vma_pages(slot->vma));
++ /* Ok, vma still valid */
++ vma = slot->vma;
++ mm = vma->vm_mm;
++ sem = &mm->mmap_sem;
++
++ if (uksm_test_exit(mm)) {
++ spin_unlock(&vma_slot_list_lock);
++ return -ENOENT;
++ }
++
++ if (down_read_trylock(sem)) {
++ spin_unlock(&vma_slot_list_lock);
++ if (slot_pool_alloc(slot)) {
++ uksm_remove_vma(vma);
++ up_read(sem);
++ return -ENOENT;
++ }
++ return 0;
++ }
++
++ spin_unlock(&vma_slot_list_lock);
++ return -EBUSY;
++}
++
++static inline unsigned long
++vma_page_address(struct page *page, struct vm_area_struct *vma)
++{
++ pgoff_t pgoff = page->index;
++ unsigned long address;
++
++ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
++ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
++ /* page should be within @vma mapping range */
++ return -EFAULT;
++ }
++ return address;
++}
++
++
++/* return 0 on success with the item's mmap_sem locked */
++static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
++{
++ struct mm_struct *mm;
++ struct vma_slot *slot = item->slot;
++ int err = -EINVAL;
++
++ struct page *page;
++
++ /*
++ * try_down_read_slot_mmap_sem() returns non-zero if the slot
++ * has been removed by uksm_remove_vma() or the lock could not be taken.
++ */
++ if (try_down_read_slot_mmap_sem(slot))
++ return -EBUSY;
++
++ mm = slot->vma->vm_mm;
++
++ if (uksm_test_exit(mm))
++ goto failout_up;
++
++ page = item->page;
++ rcu_read_lock();
++ if (!get_page_unless_zero(page)) {
++ rcu_read_unlock();
++ goto failout_up;
++ }
++
++ /* No need to consider huge page here. */
++ if (item->slot->vma->anon_vma != page_anon_vma(page) ||
++ vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
++ /*
++ * TODO:
++ * should we release this item because of its stale page
++ * mapping?
++ */
++ put_page(page);
++ rcu_read_unlock();
++ goto failout_up;
++ }
++ rcu_read_unlock();
++ return 0;
++
++failout_up:
++ up_read(&mm->mmap_sem);
++ return err;
++}
++
++/*
++ * What kinds of VMA are considered?
++ */
++static inline int vma_can_enter(struct vm_area_struct *vma)
++{
++ return uksm_flags_can_scan(vma->vm_flags);
++}
++
++/*
++ * Called whenever a fresh new vma is created. A new vma_slot
++ * is created and inserted into a global list. Must be called
++ * after the vma is inserted into its mm.
++ */
++void uksm_vma_add_new(struct vm_area_struct *vma)
++{
++ struct vma_slot *slot;
++
++ if (!vma_can_enter(vma)) {
++ vma->uksm_vma_slot = NULL;
++ return;
++ }
++
++ slot = alloc_vma_slot();
++ if (!slot) {
++ vma->uksm_vma_slot = NULL;
++ return;
++ }
++
++ vma->uksm_vma_slot = slot;
++ vma->vm_flags |= VM_MERGEABLE;
++ slot->vma = vma;
++ slot->mm = vma->vm_mm;
++ slot->ctime_j = jiffies;
++ slot->pages = vma_pages(vma);
++ spin_lock(&vma_slot_list_lock);
++ list_add_tail(&slot->slot_list, &vma_slot_new);
++ spin_unlock(&vma_slot_list_lock);
++}
++
++/* 32/3 < they < 32/2 */
++#define shiftl 8
++#define shiftr 12
++
++#define HASH_FROM_TO(from, to) \
++for (index = from; index < to; index++) { \
++ pos = random_nums[index]; \
++ hash += key[pos]; \
++ hash += (hash << shiftl); \
++ hash ^= (hash >> shiftr); \
++}
++
++
++#define HASH_FROM_DOWN_TO(from, to) \
++for (index = from - 1; index >= to; index--) { \
++ hash ^= (hash >> shiftr); \
++ hash ^= (hash >> (shiftr*2)); \
++ hash -= (hash << shiftl); \
++ hash += (hash << (shiftl*2)); \
++ pos = random_nums[index]; \
++ hash -= key[pos]; \
++}
++
++/*
++ * The main random sample hash function.
++ */
++static u32 random_sample_hash(void *addr, u32 hash_strength)
++{
++ u32 hash = 0xdeadbeef;
++ int index, pos, loop = hash_strength;
++ u32 *key = (u32 *)addr;
++
++ if (loop > HASH_STRENGTH_FULL)
++ loop = HASH_STRENGTH_FULL;
++
++ HASH_FROM_TO(0, loop);
++
++ if (hash_strength > HASH_STRENGTH_FULL) {
++ loop = hash_strength - HASH_STRENGTH_FULL;
++ HASH_FROM_TO(0, loop);
++ }
++
++ return hash;
++}
++
++
++/**
++ * Used when the hash strength is adjusted.
++ *
++ * @addr The page's virtual address
++ * @from The original hash strength
++ * @to The hash strength to change to
++ * @hash The hash value generated with the "from" hash strength
++ *
++ * return the hash value recalculated at the "to" hash strength
++ */
++static u32 delta_hash(void *addr, int from, int to, u32 hash)
++{
++ u32 *key = (u32 *)addr;
++ int index, pos; /* make sure they are int type */
++
++ if (to > from) {
++ if (from >= HASH_STRENGTH_FULL) {
++ from -= HASH_STRENGTH_FULL;
++ to -= HASH_STRENGTH_FULL;
++ HASH_FROM_TO(from, to);
++ } else if (to <= HASH_STRENGTH_FULL) {
++ HASH_FROM_TO(from, to);
++ } else {
++ HASH_FROM_TO(from, HASH_STRENGTH_FULL);
++ HASH_FROM_TO(0, to - HASH_STRENGTH_FULL);
++ }
++ } else {
++ if (from <= HASH_STRENGTH_FULL) {
++ HASH_FROM_DOWN_TO(from, to);
++ } else if (to >= HASH_STRENGTH_FULL) {
++ from -= HASH_STRENGTH_FULL;
++ to -= HASH_STRENGTH_FULL;
++ HASH_FROM_DOWN_TO(from, to);
++ } else {
++ HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0);
++ HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to);
++ }
++ }
++
++ return hash;
++}
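++
++/*
++ * Illustrative property: HASH_FROM_DOWN_TO() is the algebraic inverse of
++ * HASH_FROM_TO(), so delta-hashing from an old strength should agree with
++ * re-hashing the page from scratch:
++ *
++ *	u32 h_old = random_sample_hash(addr, old_strength);
++ *	u32 h_new = delta_hash(addr, old_strength, new_strength, h_old);
++ *
++ * and h_new is expected to equal random_sample_hash(addr, new_strength),
++ * which is what lets the stable tree be re-hashed incrementally.
++ */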
++
++/**
++ *
++ * Called when: rshash_pos or rshash_neg is about to overflow or a scan round
++ * has finished.
++ *
++ * return 0 if no page has been scanned since last call, 1 otherwise.
++ */
++static inline int encode_benefit(void)
++{
++ u64 scanned_delta, pos_delta, neg_delta;
++ unsigned long base = benefit.base;
++
++ scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last;
++
++ if (!scanned_delta)
++ return 0;
++
++ scanned_delta >>= base;
++ pos_delta = rshash_pos >> base;
++ neg_delta = rshash_neg >> base;
++
++ if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) ||
++ CAN_OVERFLOW_U64(benefit.neg, neg_delta) ||
++ CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) {
++ benefit.scanned >>= 1;
++ benefit.neg >>= 1;
++ benefit.pos >>= 1;
++ benefit.base++;
++ scanned_delta >>= 1;
++ pos_delta >>= 1;
++ neg_delta >>= 1;
++ }
++
++ benefit.pos += pos_delta;
++ benefit.neg += neg_delta;
++ benefit.scanned += scanned_delta;
++
++ BUG_ON(!benefit.scanned);
++
++ rshash_pos = rshash_neg = 0;
++ uksm_pages_scanned_last = uksm_pages_scanned;
++
++ return 1;
++}
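++
++/*
++ * A note on the encoding above: benefit.pos/neg/scanned are kept
++ * right-shifted by benefit.base so they never overflow; whenever an addition
++ * would overflow, everything is halved and base is incremented. The
++ * approximate true totals can be recovered as:
++ *
++ *	total_scanned ~= benefit.scanned << benefit.base;
++ *	total_pos     ~= benefit.pos     << benefit.base;
++ *	total_neg     ~= benefit.neg     << benefit.base;
++ */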
++
++static inline void reset_benefit(void)
++{
++ benefit.pos = 0;
++ benefit.neg = 0;
++ benefit.base = 0;
++ benefit.scanned = 0;
++}
++
++static inline void inc_rshash_pos(unsigned long delta)
++{
++ if (CAN_OVERFLOW_U64(rshash_pos, delta))
++ encode_benefit();
++
++ rshash_pos += delta;
++}
++
++static inline void inc_rshash_neg(unsigned long delta)
++{
++ if (CAN_OVERFLOW_U64(rshash_neg, delta))
++ encode_benefit();
++
++ rshash_neg += delta;
++}
++
++
++static inline u32 page_hash(struct page *page, unsigned long hash_strength,
++ int cost_accounting)
++{
++ u32 val;
++ unsigned long delta;
++
++ void *addr = kmap_atomic(page);
++
++ val = random_sample_hash(addr, hash_strength);
++ kunmap_atomic(addr);
++
++ if (cost_accounting) {
++ if (hash_strength < HASH_STRENGTH_FULL)
++ delta = HASH_STRENGTH_FULL - hash_strength;
++ else
++ delta = 0;
++
++ inc_rshash_pos(delta);
++ }
++
++ return val;
++}
++
++static int memcmp_pages_with_cost(struct page *page1, struct page *page2,
++ int cost_accounting)
++{
++ char *addr1, *addr2;
++ int ret;
++
++ addr1 = kmap_atomic(page1);
++ addr2 = kmap_atomic(page2);
++ ret = memcmp(addr1, addr2, PAGE_SIZE);
++ kunmap_atomic(addr2);
++ kunmap_atomic(addr1);
++
++ if (cost_accounting)
++ inc_rshash_neg(memcmp_cost);
++
++ return ret;
++}
++
++static inline int pages_identical_with_cost(struct page *page1, struct page *page2)
++{
++ return !memcmp_pages_with_cost(page1, page2, 0);
++}
++
++static inline int is_page_full_zero(struct page *page)
++{
++ char *addr;
++ int ret;
++
++ addr = kmap_atomic(page);
++ ret = is_full_zero(addr, PAGE_SIZE);
++ kunmap_atomic(addr);
++
++ return ret;
++}
++
++static int write_protect_page(struct vm_area_struct *vma, struct page *page,
++ pte_t *orig_pte, pte_t *old_pte)
++{
++ struct mm_struct *mm = vma->vm_mm;
++ struct page_vma_mapped_walk pvmw = {
++ .page = page,
++ .vma = vma,
++ };
++ struct mmu_notifier_range range;
++ int swapped;
++ int err = -EFAULT;
++
++ pvmw.address = page_address_in_vma(page, vma);
++ if (pvmw.address == -EFAULT)
++ goto out;
++
++ BUG_ON(PageTransCompound(page));
++
++ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, pvmw.address,
++ pvmw.address + PAGE_SIZE);
++ mmu_notifier_invalidate_range_start(&range);
++
++ if (!page_vma_mapped_walk(&pvmw))
++ goto out_mn;
++ if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
++ goto out_unlock;
++
++ if (old_pte)
++ *old_pte = *pvmw.pte;
++
++ if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
++ (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) || mm_tlb_flush_pending(mm)) {
++ pte_t entry;
++
++ swapped = PageSwapCache(page);
++ flush_cache_page(vma, pvmw.address, page_to_pfn(page));
++ /*
++ * Ok this is tricky: when get_user_pages_fast() runs it doesn't
++ * take any lock, therefore the check that we are going to make
++ * with the pagecount against the mapcount is racy and
++ * O_DIRECT can happen right after the check.
++ * So we clear the pte and flush the tlb before the check;
++ * this assures us that no O_DIRECT can happen after the check
++ * or in the middle of the check.
++ */
++ entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte);
++ /*
++ * Check that no O_DIRECT or similar I/O is in progress on the
++ * page
++ */
++ if (page_mapcount(page) + 1 + swapped != page_count(page)) {
++ set_pte_at(mm, pvmw.address, pvmw.pte, entry);
++ goto out_unlock;
++ }
++ if (pte_dirty(entry))
++ set_page_dirty(page);
++
++ if (pte_protnone(entry))
++ entry = pte_mkclean(pte_clear_savedwrite(entry));
++ else
++ entry = pte_mkclean(pte_wrprotect(entry));
++
++ set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
++ }
++ *orig_pte = *pvmw.pte;
++ err = 0;
++
++out_unlock:
++ page_vma_mapped_walk_done(&pvmw);
++out_mn:
++ mmu_notifier_invalidate_range_end(&range);
++out:
++ return err;
++}
++
++#define MERGE_ERR_PGERR 1 /* the page is invalid, cannot continue */
++#define MERGE_ERR_COLLI 2 /* there is a collision */
++#define MERGE_ERR_COLLI_MAX 3 /* collision at the max hash strength */
++#define MERGE_ERR_CHANGED 4 /* the page has changed since last hash */
++
++
++/**
++ * replace_page - replace page in vma by new ksm page
++ * @vma: vma that holds the pte pointing to page
++ * @page: the page we are replacing by kpage
++ * @kpage: the ksm page we replace page by
++ * @orig_pte: the original value of the pte
++ *
++ * Returns 0 on success, MERGE_ERR_PGERR on failure.
++ */
++static int replace_page(struct vm_area_struct *vma, struct page *page,
++ struct page *kpage, pte_t orig_pte)
++{
++ struct mm_struct *mm = vma->vm_mm;
++ struct mmu_notifier_range range;
++ pgd_t *pgd;
++ p4d_t *p4d;
++ pud_t *pud;
++ pmd_t *pmd;
++ pte_t *ptep;
++ spinlock_t *ptl;
++ pte_t entry;
++
++ unsigned long addr;
++ int err = MERGE_ERR_PGERR;
++
++ addr = page_address_in_vma(page, vma);
++ if (addr == -EFAULT)
++ goto out;
++
++ pgd = pgd_offset(mm, addr);
++ if (!pgd_present(*pgd))
++ goto out;
++
++ p4d = p4d_offset(pgd, addr);
++ pud = pud_offset(p4d, addr);
++ if (!pud_present(*pud))
++ goto out;
++
++ pmd = pmd_offset(pud, addr);
++ BUG_ON(pmd_trans_huge(*pmd));
++ if (!pmd_present(*pmd))
++ goto out;
++
++ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
++ addr + PAGE_SIZE);
++ mmu_notifier_invalidate_range_start(&range);
++
++ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
++ if (!pte_same(*ptep, orig_pte)) {
++ pte_unmap_unlock(ptep, ptl);
++ goto out_mn;
++ }
++
++ flush_cache_page(vma, addr, pte_pfn(*ptep));
++ ptep_clear_flush_notify(vma, addr, ptep);
++ entry = mk_pte(kpage, vma->vm_page_prot);
++
++ /* special treatment is needed for zero_page */
++ if ((page_to_pfn(kpage) == uksm_zero_pfn) ||
++ (page_to_pfn(kpage) == zero_pfn)) {
++ entry = pte_mkspecial(entry);
++ dec_mm_counter(mm, MM_ANONPAGES);
++ inc_zone_page_state(page, NR_UKSM_ZERO_PAGES);
++ } else {
++ get_page(kpage);
++ page_add_anon_rmap(kpage, vma, addr, false);
++ }
++
++ set_pte_at_notify(mm, addr, ptep, entry);
++
++ page_remove_rmap(page, false);
++ if (!page_mapped(page))
++ try_to_free_swap(page);
++ put_page(page);
++
++ pte_unmap_unlock(ptep, ptl);
++ err = 0;
++out_mn:
++ mmu_notifier_invalidate_range_end(&range);
++out:
++ return err;
++}
++
++
++/**
++ * Fully hash a page with HASH_STRENGTH_MAX and return a non-zero hash value.
++ * A zero hash value at HASH_STRENGTH_MAX is used to indicate that the
++ * hash_max member has not been calculated yet.
++ *
++ * @page The page to be hashed
++ * @hash_old The hash value calculated with current hash strength
++ *
++ * return the new hash value calculated at HASH_STRENGTH_MAX
++ */
++static inline u32 page_hash_max(struct page *page, u32 hash_old)
++{
++ u32 hash_max = 0;
++ void *addr;
++
++ addr = kmap_atomic(page);
++ hash_max = delta_hash(addr, hash_strength,
++ HASH_STRENGTH_MAX, hash_old);
++
++ kunmap_atomic(addr);
++
++ if (!hash_max)
++ hash_max = 1;
++
++ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
++ return hash_max;
++}
++
++/*
++ * We compare the hash again, to ensure that it is really a hash collision
++ * instead of being caused by a page write.
++ */
++static inline int check_collision(struct rmap_item *rmap_item,
++ u32 hash)
++{
++ int err;
++ struct page *page = rmap_item->page;
++
++ /* if this rmap_item has already been hash_maxed, then the collision
++ * must appear in the second-level rbtree search. In this case we check
++ * if its hash_max value has been changed. Otherwise, the collision
++ * happens in the first-level rbtree search, so we check against its
++ * current hash value.
++ */
++ if (rmap_item->hash_max) {
++ inc_rshash_neg(memcmp_cost);
++ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
++
++ if (rmap_item->hash_max == page_hash_max(page, hash))
++ err = MERGE_ERR_COLLI;
++ else
++ err = MERGE_ERR_CHANGED;
++ } else {
++ inc_rshash_neg(memcmp_cost + hash_strength);
++
++ if (page_hash(page, hash_strength, 0) == hash)
++ err = MERGE_ERR_COLLI;
++ else
++ err = MERGE_ERR_CHANGED;
++ }
++
++ return err;
++}
++
++/**
++ * Try to merge a rmap_item.page with a kpage in stable node. kpage must
++ * already be a ksm page.
++ *
++ * @return 0 if the pages were merged, a MERGE_ERR_* code otherwise.
++ */
++static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item,
++ struct page *kpage, u32 hash)
++{
++ struct vm_area_struct *vma = rmap_item->slot->vma;
++ struct mm_struct *mm = vma->vm_mm;
++ pte_t orig_pte = __pte(0);
++ int err = MERGE_ERR_PGERR;
++ struct page *page;
++
++ if (uksm_test_exit(mm))
++ goto out;
++
++ page = rmap_item->page;
++
++ if (page == kpage) { /* ksm page forked */
++ err = 0;
++ goto out;
++ }
++
++ /*
++ * We need the page lock to read a stable PageSwapCache in
++ * write_protect_page(). We use trylock_page() instead of
++ * lock_page() because we don't want to wait here - we
++ * prefer to continue scanning and merging different pages,
++ * then come back to this page when it is unlocked.
++ */
++ if (!trylock_page(page))
++ goto out;
++
++ if (!PageAnon(page) || !PageKsm(kpage))
++ goto out_unlock;
++
++ if (PageTransCompound(page)) {
++ err = split_huge_page(page);
++ if (err)
++ goto out_unlock;
++ }
++
++ /*
++ * If this anonymous page is mapped only here, its pte may need
++ * to be write-protected. If it's mapped elsewhere, all of its
++ * ptes are necessarily already write-protected. But in either
++ * case, we need to lock and check page_count is not raised.
++ */
++ if (write_protect_page(vma, page, &orig_pte, NULL) == 0) {
++ if (pages_identical_with_cost(page, kpage))
++ err = replace_page(vma, page, kpage, orig_pte);
++ else
++ err = check_collision(rmap_item, hash);
++ }
++
++ if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
++ munlock_vma_page(page);
++ if (!PageMlocked(kpage)) {
++ unlock_page(page);
++ lock_page(kpage);
++ mlock_vma_page(kpage);
++ page = kpage; /* for final unlock */
++ }
++ }
++
++out_unlock:
++ unlock_page(page);
++out:
++ return err;
++}
++
++
++
++/**
++ * If two pages fail to merge in try_to_merge_two_pages, then we have a chance
++ * to restore a page mapping that has been changed in try_to_merge_two_pages.
++ *
++ * @return 0 on success.
++ */
++static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr,
++ pte_t orig_pte, pte_t wprt_pte)
++{
++ struct mm_struct *mm = vma->vm_mm;
++ pgd_t *pgd;
++ p4d_t *p4d;
++ pud_t *pud;
++ pmd_t *pmd;
++ pte_t *ptep;
++ spinlock_t *ptl;
++
++ int err = -EFAULT;
++
++ pgd = pgd_offset(mm, addr);
++ if (!pgd_present(*pgd))
++ goto out;
++
++ p4d = p4d_offset(pgd, addr);
++ pud = pud_offset(p4d, addr);
++ if (!pud_present(*pud))
++ goto out;
++
++ pmd = pmd_offset(pud, addr);
++ if (!pmd_present(*pmd))
++ goto out;
++
++ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
++ if (!pte_same(*ptep, wprt_pte)) {
++ /* already copied, let it be */
++ pte_unmap_unlock(ptep, ptl);
++ goto out;
++ }
++
++ /*
++ * Good, still here. While we still hold the ksm page, it does not
++ * return to the free page pool, so there is no way a pte was changed
++ * to another page and then back to this page. Also remember that ksm
++ * pages are not reused in do_wp_page(). So it's safe to restore the original
++ * pte.
++ */
++ flush_cache_page(vma, addr, pte_pfn(*ptep));
++ ptep_clear_flush_notify(vma, addr, ptep);
++ set_pte_at_notify(mm, addr, ptep, orig_pte);
++
++ pte_unmap_unlock(ptep, ptl);
++ err = 0;
++out:
++ return err;
++}
++
++/**
++ * try_to_merge_two_pages() - take two identical pages and prepare
++ * them to be merged into one page (rmap_item->page)
++ *
++ * @return 0 if we successfully merged two identical pages into
++ * one ksm page. MERGE_ERR_COLLI if it's only a hash collision
++ * found in the rbtree search. MERGE_ERR_CHANGED if rmap_item has been
++ * changed since it was hashed. MERGE_ERR_PGERR otherwise.
++ *
++ */
++static int try_to_merge_two_pages(struct rmap_item *rmap_item,
++ struct rmap_item *tree_rmap_item,
++ u32 hash)
++{
++ pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0);
++ pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0);
++ struct vm_area_struct *vma1 = rmap_item->slot->vma;
++ struct vm_area_struct *vma2 = tree_rmap_item->slot->vma;
++ struct page *page = rmap_item->page;
++ struct page *tree_page = tree_rmap_item->page;
++ int err = MERGE_ERR_PGERR;
++ struct address_space *saved_mapping;
++
++
++ if (rmap_item->page == tree_rmap_item->page)
++ goto out;
++
++ if (!trylock_page(page))
++ goto out;
++
++ if (!PageAnon(page))
++ goto out_unlock;
++
++ if (PageTransCompound(page)) {
++ err = split_huge_page(page);
++ if (err)
++ goto out_unlock;
++ }
++
++ if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) {
++ unlock_page(page);
++ goto out;
++ }
++
++ /*
++ * While we hold page lock, upgrade page from
++ * PageAnon+anon_vma to PageKsm+NULL stable_node:
++ * stable_tree_insert() will update stable_node.
++ */
++ saved_mapping = page->mapping;
++ set_page_stable_node(page, NULL);
++ mark_page_accessed(page);
++ if (!PageDirty(page))
++ SetPageDirty(page);
++
++ unlock_page(page);
++
++ if (!trylock_page(tree_page))
++ goto restore_out;
++
++ if (!PageAnon(tree_page)) {
++ unlock_page(tree_page);
++ goto restore_out;
++ }
++
++ if (PageTransCompound(tree_page)) {
++ err = split_huge_page(tree_page);
++ if (err) {
++ unlock_page(tree_page);
++ goto restore_out;
++ }
++ }
++
++ if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) {
++ unlock_page(tree_page);
++ goto restore_out;
++ }
++
++ if (pages_identical_with_cost(page, tree_page)) {
++ err = replace_page(vma2, tree_page, page, wprt_pte2);
++ if (err) {
++ unlock_page(tree_page);
++ goto restore_out;
++ }
++
++ if ((vma2->vm_flags & VM_LOCKED)) {
++ munlock_vma_page(tree_page);
++ if (!PageMlocked(page)) {
++ unlock_page(tree_page);
++ lock_page(page);
++ mlock_vma_page(page);
++ tree_page = page; /* for final unlock */
++ }
++ }
++
++ unlock_page(tree_page);
++
++ goto out; /* success */
++
++ } else {
++ if (tree_rmap_item->hash_max &&
++ tree_rmap_item->hash_max == rmap_item->hash_max) {
++ err = MERGE_ERR_COLLI_MAX;
++ } else if (page_hash(page, hash_strength, 0) ==
++ page_hash(tree_page, hash_strength, 0)) {
++ inc_rshash_neg(memcmp_cost + hash_strength * 2);
++ err = MERGE_ERR_COLLI;
++ } else {
++ err = MERGE_ERR_CHANGED;
++ }
++
++ unlock_page(tree_page);
++ }
++
++restore_out:
++ lock_page(page);
++ if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item),
++ orig_pte1, wprt_pte1))
++ page->mapping = saved_mapping;
++
++out_unlock:
++ unlock_page(page);
++out:
++ return err;
++}
++
++static inline int hash_cmp(u32 new_val, u32 node_val)
++{
++ if (new_val > node_val)
++ return 1;
++ else if (new_val < node_val)
++ return -1;
++ else
++ return 0;
++}
++
++static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash)
++{
++ u32 hash_max = item->hash_max;
++
++ if (!hash_max) {
++ hash_max = page_hash_max(item->page, hash);
++
++ item->hash_max = hash_max;
++ }
++
++ return hash_max;
++}
++
++
++
++/**
++ * stable_tree_search() - search the stable tree for a page
++ *
++ * @item: the rmap_item we are comparing with
++ * @hash: the hash value of this item->page already calculated
++ *
++ * @return the page we have found, NULL otherwise. The returned page
++ * has already been gotten (its refcount has been taken).
++ */
++static struct page *stable_tree_search(struct rmap_item *item, u32 hash)
++{
++ struct rb_node *node = root_stable_treep->rb_node;
++ struct tree_node *tree_node;
++ unsigned long hash_max;
++ struct page *page = item->page;
++ struct stable_node *stable_node;
++
++ stable_node = page_stable_node(page);
++ if (stable_node) {
++ /* ksm page forked, that is
++ * if (PageKsm(page) && !in_stable_tree(rmap_item))
++ * it's actually gotten once outside.
++ */
++ get_page(page);
++ return page;
++ }
++
++ while (node) {
++ int cmp;
++
++ tree_node = rb_entry(node, struct tree_node, node);
++
++ cmp = hash_cmp(hash, tree_node->hash);
++
++ if (cmp < 0)
++ node = node->rb_left;
++ else if (cmp > 0)
++ node = node->rb_right;
++ else
++ break;
++ }
++
++ if (!node)
++ return NULL;
++
++ if (tree_node->count == 1) {
++ stable_node = rb_entry(tree_node->sub_root.rb_node,
++ struct stable_node, node);
++ BUG_ON(!stable_node);
++
++ goto get_page_out;
++ }
++
++ /*
++ * ok, we have to search the second
++ * level subtree; hash the page at
++ * full strength.
++ */
++ node = tree_node->sub_root.rb_node;
++ BUG_ON(!node);
++ hash_max = rmap_item_hash_max(item, hash);
++
++ while (node) {
++ int cmp;
++
++ stable_node = rb_entry(node, struct stable_node, node);
++
++ cmp = hash_cmp(hash_max, stable_node->hash_max);
++
++ if (cmp < 0)
++ node = node->rb_left;
++ else if (cmp > 0)
++ node = node->rb_right;
++ else
++ goto get_page_out;
++ }
++
++ return NULL;
++
++get_page_out:
++ page = get_uksm_page(stable_node, 1, 1);
++ return page;
++}
++
++static int try_merge_rmap_item(struct rmap_item *item,
++ struct page *kpage,
++ struct page *tree_page)
++{
++ struct vm_area_struct *vma = item->slot->vma;
++ struct page_vma_mapped_walk pvmw = {
++ .page = kpage,
++ .vma = vma,
++ };
++
++ pvmw.address = get_rmap_addr(item);
++ if (!page_vma_mapped_walk(&pvmw))
++ return 0;
++
++ if (pte_write(*pvmw.pte)) {
++ /* has changed, abort! */
++ page_vma_mapped_walk_done(&pvmw);
++ return 0;
++ }
++
++ get_page(tree_page);
++ page_add_anon_rmap(tree_page, vma, pvmw.address, false);
++
++ flush_cache_page(vma, pvmw.address, page_to_pfn(kpage));
++ ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte);
++ set_pte_at_notify(vma->vm_mm, pvmw.address, pvmw.pte,
++ mk_pte(tree_page, vma->vm_page_prot));
++
++ page_remove_rmap(kpage, false);
++ put_page(kpage);
++
++ page_vma_mapped_walk_done(&pvmw);
++
++ return 1;
++}
++
++/**
++ * try_merge_with_stable() - when two rmap_items need to be inserted
++ * into the stable tree and the page was found to be identical to a stable
++ * ksm page, this is the last chance to merge them into one.
++ *
++ * @item1: the rmap_item holding the page which we wanted to insert
++ * into the stable tree.
++ * @item2: the other rmap_item we found in the unstable tree search
++ * @kpage: the page currently mapped by the two rmap_items; on success it
++ * is replaced with tree_page
++ * @tree_page: the page we found identical in the stable tree node
++ * @success1: returns whether item1 was successfully merged
++ * @success2: returns whether item2 was successfully merged
++ */
++static void try_merge_with_stable(struct rmap_item *item1,
++ struct rmap_item *item2,
++ struct page **kpage,
++ struct page *tree_page,
++ int *success1, int *success2)
++{
++ struct vm_area_struct *vma1 = item1->slot->vma;
++ struct vm_area_struct *vma2 = item2->slot->vma;
++ *success1 = 0;
++ *success2 = 0;
++
++ if (unlikely(*kpage == tree_page)) {
++ /* I don't think this can really happen */
++ pr_warn("UKSM: unexpected condition detected in "
++ "%s -- *kpage == tree_page !\n", __func__);
++ *success1 = 1;
++ *success2 = 1;
++ return;
++ }
++
++ if (!PageAnon(*kpage) || !PageKsm(*kpage))
++ goto failed;
++
++ if (!trylock_page(tree_page))
++ goto failed;
++
++ /* If the oldpage is still ksm and still pointed
++ * to in the right place, and still write protected,
++ * we are confident it has not changed, so there is no need to
++ * memcmp anymore.
++ * Beware, we cannot take nested pte locks:
++ * deadlock risk.
++ */
++ if (!try_merge_rmap_item(item1, *kpage, tree_page))
++ goto unlock_failed;
++
++ /* ok, now vma2; remember that pte1 is already set */
++ if (!try_merge_rmap_item(item2, *kpage, tree_page))
++ goto success_1;
++
++ *success2 = 1;
++success_1:
++ *success1 = 1;
++
++
++ if ((*success1 && vma1->vm_flags & VM_LOCKED) ||
++ (*success2 && vma2->vm_flags & VM_LOCKED)) {
++ munlock_vma_page(*kpage);
++ if (!PageMlocked(tree_page))
++ mlock_vma_page(tree_page);
++ }
++
++ /*
++ * We do not need the old page any more in the caller, so we can drop the lock
++ * now.
++ */
++ unlock_page(*kpage);
++ *kpage = tree_page; /* Get unlocked outside. */
++ return;
++
++unlock_failed:
++ unlock_page(tree_page);
++failed:
++ return;
++}
++
++static inline void stable_node_hash_max(struct stable_node *node,
++ struct page *page, u32 hash)
++{
++ u32 hash_max = node->hash_max;
++
++ if (!hash_max) {
++ hash_max = page_hash_max(page, hash);
++ node->hash_max = hash_max;
++ }
++}
++
++static inline
++struct stable_node *new_stable_node(struct tree_node *tree_node,
++ struct page *kpage, u32 hash_max)
++{
++ struct stable_node *new_stable_node;
++
++ new_stable_node = alloc_stable_node();
++ if (!new_stable_node)
++ return NULL;
++
++ new_stable_node->kpfn = page_to_pfn(kpage);
++ new_stable_node->hash_max = hash_max;
++ new_stable_node->tree_node = tree_node;
++ set_page_stable_node(kpage, new_stable_node);
++
++ return new_stable_node;
++}
++
++static inline
++struct stable_node *first_level_insert(struct tree_node *tree_node,
++ struct rmap_item *rmap_item,
++ struct rmap_item *tree_rmap_item,
++ struct page **kpage, u32 hash,
++ int *success1, int *success2)
++{
++ int cmp;
++ struct page *tree_page;
++ u32 hash_max = 0;
++ struct stable_node *stable_node, *new_snode;
++ struct rb_node *parent = NULL, **new;
++
++ /* this tree node contains no sub-tree yet */
++ stable_node = rb_entry(tree_node->sub_root.rb_node,
++ struct stable_node, node);
++
++ tree_page = get_uksm_page(stable_node, 1, 0);
++ if (tree_page) {
++ cmp = memcmp_pages_with_cost(*kpage, tree_page, 1);
++ if (!cmp) {
++ try_merge_with_stable(rmap_item, tree_rmap_item, kpage,
++ tree_page, success1, success2);
++ put_page(tree_page);
++ if (!*success1 && !*success2)
++ goto failed;
++
++ return stable_node;
++
++ } else {
++ /*
++ * Collision at the first level: a sub-tree needs to be
++ * created, so a new stable node will be added below.
++ * Compute the max hashes while we still hold a reference
++ * on tree_page.
++ */
++ stable_node_hash_max(stable_node, tree_page,
++ tree_node->hash);
++ hash_max = rmap_item_hash_max(rmap_item, hash);
++ cmp = hash_cmp(hash_max, stable_node->hash_max);
++ put_page(tree_page);
++
++ parent = &stable_node->node;
++ if (cmp < 0)
++ new = &parent->rb_left;
++ else if (cmp > 0)
++ new = &parent->rb_right;
++ else
++ goto failed;
++ }
++
++ } else {
++ /*
++ * The only stable_node was deleted; reuse its tree_node.
++ */
++ parent = NULL;
++ new = &tree_node->sub_root.rb_node;
++ }
++
++ new_snode = new_stable_node(tree_node, *kpage, hash_max);
++ if (!new_snode)
++ goto failed;
++
++ rb_link_node(&new_snode->node, parent, new);
++ rb_insert_color(&new_snode->node, &tree_node->sub_root);
++ tree_node->count++;
++ *success1 = *success2 = 1;
++
++ return new_snode;
++
++failed:
++ return NULL;
++}
++
++static inline
++struct stable_node *stable_subtree_insert(struct tree_node *tree_node,
++ struct rmap_item *rmap_item,
++ struct rmap_item *tree_rmap_item,
++ struct page **kpage, u32 hash,
++ int *success1, int *success2)
++{
++ struct page *tree_page;
++ u32 hash_max;
++ struct stable_node *stable_node, *new_snode;
++ struct rb_node *parent, **new;
++
++research:
++ parent = NULL;
++ new = &tree_node->sub_root.rb_node;
++ BUG_ON(!*new);
++ hash_max = rmap_item_hash_max(rmap_item, hash);
++ while (*new) {
++ int cmp;
++
++ stable_node = rb_entry(*new, struct stable_node, node);
++
++ cmp = hash_cmp(hash_max, stable_node->hash_max);
++
++ if (cmp < 0) {
++ parent = *new;
++ new = &parent->rb_left;
++ } else if (cmp > 0) {
++ parent = *new;
++ new = &parent->rb_right;
++ } else {
++ tree_page = get_uksm_page(stable_node, 1, 0);
++ if (tree_page) {
++ cmp = memcmp_pages_with_cost(*kpage, tree_page, 1);
++ if (!cmp) {
++ try_merge_with_stable(rmap_item,
++ tree_rmap_item, kpage,
++ tree_page, success1, success2);
++
++ put_page(tree_page);
++ if (!*success1 && !*success2)
++ goto failed;
++ /*
++ * successfully merged with a stable
++ * node
++ */
++ return stable_node;
++ } else {
++ put_page(tree_page);
++ goto failed;
++ }
++ } else {
++ /*
++ * The stable node may have been deleted and the
++ * subtree restructured; we cannot continue here,
++ * so re-search it.
++ */
++ if (tree_node->count) {
++ goto research;
++ } else {
++ /* reuse the tree node*/
++ parent = NULL;
++ new = &tree_node->sub_root.rb_node;
++ }
++ }
++ }
++ }
++
++ new_snode = new_stable_node(tree_node, *kpage, hash_max);
++ if (!new_snode)
++ goto failed;
++
++ rb_link_node(&new_snode->node, parent, new);
++ rb_insert_color(&new_snode->node, &tree_node->sub_root);
++ tree_node->count++;
++ *success1 = *success2 = 1;
++
++ return new_snode;
++
++failed:
++ return NULL;
++}
++
++
++/**
++ * stable_tree_insert() - try to insert a page merged in the unstable tree
++ * into the stable tree
++ *
++ * @kpage: the page that needs to be inserted
++ * @hash: the current hash of this page
++ * @rmap_item: the rmap_item being scanned
++ * @tree_rmap_item: the rmap_item found in the unstable tree
++ * @success1: set to 1 if rmap_item is merged
++ * @success2: set to 1 if tree_rmap_item is merged
++ *
++ * @return the stable_node in the stable tree if at least one
++ * rmap_item was inserted into the stable tree, NULL
++ * otherwise.
++ */
++static struct stable_node *
++stable_tree_insert(struct page **kpage, u32 hash,
++ struct rmap_item *rmap_item,
++ struct rmap_item *tree_rmap_item,
++ int *success1, int *success2)
++{
++ struct rb_node **new = &root_stable_treep->rb_node;
++ struct rb_node *parent = NULL;
++ struct stable_node *stable_node;
++ struct tree_node *tree_node;
++ u32 hash_max = 0;
++
++ *success1 = *success2 = 0;
++
++ while (*new) {
++ int cmp;
++
++ tree_node = rb_entry(*new, struct tree_node, node);
++
++ cmp = hash_cmp(hash, tree_node->hash);
++
++ if (cmp < 0) {
++ parent = *new;
++ new = &parent->rb_left;
++ } else if (cmp > 0) {
++ parent = *new;
++ new = &parent->rb_right;
++ } else
++ break;
++ }
++
++ if (*new) {
++ if (tree_node->count == 1) {
++ stable_node = first_level_insert(tree_node, rmap_item,
++ tree_rmap_item, kpage,
++ hash, success1, success2);
++ } else {
++ stable_node = stable_subtree_insert(tree_node,
++ rmap_item, tree_rmap_item, kpage,
++ hash, success1, success2);
++ }
++ } else {
++
++ /* no tree node found */
++ tree_node = alloc_tree_node(stable_tree_node_listp);
++ if (!tree_node) {
++ stable_node = NULL;
++ goto out;
++ }
++
++ stable_node = new_stable_node(tree_node, *kpage, hash_max);
++ if (!stable_node) {
++ free_tree_node(tree_node);
++ goto out;
++ }
++
++ tree_node->hash = hash;
++ rb_link_node(&tree_node->node, parent, new);
++ rb_insert_color(&tree_node->node, root_stable_treep);
++ parent = NULL;
++ new = &tree_node->sub_root.rb_node;
++
++ rb_link_node(&stable_node->node, parent, new);
++ rb_insert_color(&stable_node->node, &tree_node->sub_root);
++ tree_node->count++;
++ *success1 = *success2 = 1;
++ }
++
++out:
++ return stable_node;
++}
++
++
++/**
++ * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem
++ *
++ * @return 0 on success, -EBUSY if unable to lock the mmap_sem,
++ * -EINVAL if the page mapping has been changed.
++ */
++static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item)
++{
++ int err;
++
++ err = get_mergeable_page_lock_mmap(tree_rmap_item);
++
++ if (err == -EINVAL) {
++ /* its page map has been changed, remove it */
++ remove_rmap_item_from_tree(tree_rmap_item);
++ }
++
++ /* On success, the page has been pinned and the mmap_sem is read-locked. */
++ return err;
++}
++
++
++/**
++ * unstable_tree_search_insert() - search the unstable tree for an rmap_item
++ * with the same hash value; if found, get its page and trylock its mmap_sem,
++ * otherwise insert rmap_item into the unstable tree.
++ */
++static inline
++struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
++ u32 hash)
++
++{
++ struct rb_node **new = &root_unstable_tree.rb_node;
++ struct rb_node *parent = NULL;
++ struct tree_node *tree_node;
++ u32 hash_max;
++ struct rmap_item *tree_rmap_item;
++
++ while (*new) {
++ int cmp;
++
++ tree_node = rb_entry(*new, struct tree_node, node);
++
++ cmp = hash_cmp(hash, tree_node->hash);
++
++ if (cmp < 0) {
++ parent = *new;
++ new = &parent->rb_left;
++ } else if (cmp > 0) {
++ parent = *new;
++ new = &parent->rb_right;
++ } else
++ break;
++ }
++
++ if (*new) {
++ /* got the tree_node */
++ if (tree_node->count == 1) {
++ tree_rmap_item = rb_entry(tree_node->sub_root.rb_node,
++ struct rmap_item, node);
++ BUG_ON(!tree_rmap_item);
++
++ goto get_page_out;
++ }
++
++ /* well, search the collision subtree */
++ new = &tree_node->sub_root.rb_node;
++ BUG_ON(!*new);
++ hash_max = rmap_item_hash_max(rmap_item, hash);
++
++ while (*new) {
++ int cmp;
++
++ tree_rmap_item = rb_entry(*new, struct rmap_item,
++ node);
++
++ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
++ parent = *new;
++ if (cmp < 0)
++ new = &parent->rb_left;
++ else if (cmp > 0)
++ new = &parent->rb_right;
++ else
++ goto get_page_out;
++ }
++ } else {
++ /* alloc a new tree_node */
++ tree_node = alloc_tree_node(&unstable_tree_node_list);
++ if (!tree_node)
++ return NULL;
++
++ tree_node->hash = hash;
++ rb_link_node(&tree_node->node, parent, new);
++ rb_insert_color(&tree_node->node, &root_unstable_tree);
++ parent = NULL;
++ new = &tree_node->sub_root.rb_node;
++ }
++
++ /* not found even in the sub-tree, insert this rmap_item */
++ rmap_item->tree_node = tree_node;
++ rmap_item->address |= UNSTABLE_FLAG;
++ rmap_item->hash_round = uksm_hash_round;
++ rb_link_node(&rmap_item->node, parent, new);
++ rb_insert_color(&rmap_item->node, &tree_node->sub_root);
++
++ uksm_pages_unshared++;
++ return NULL;
++
++get_page_out:
++ if (tree_rmap_item->page == rmap_item->page)
++ return NULL;
++
++ if (get_tree_rmap_item_page(tree_rmap_item))
++ return NULL;
++
++ return tree_rmap_item;
++}
++
++static void hold_anon_vma(struct rmap_item *rmap_item,
++ struct anon_vma *anon_vma)
++{
++ rmap_item->anon_vma = anon_vma;
++ get_anon_vma(anon_vma);
++}
++
++
++/**
++ * stable_tree_append() - append a rmap_item to a stable node. Deduplication
++ * ratio statistics are also updated in this function.
++ */
++static void stable_tree_append(struct rmap_item *rmap_item,
++ struct stable_node *stable_node, int logdedup)
++{
++ struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL;
++ unsigned long key = (unsigned long)rmap_item->slot;
++ unsigned long factor = rmap_item->slot->rung->step;
++
++ BUG_ON(!stable_node);
++ rmap_item->address |= STABLE_FLAG;
++
++ if (hlist_empty(&stable_node->hlist)) {
++ uksm_pages_shared++;
++ goto node_vma_new;
++ } else {
++ uksm_pages_sharing++;
++ }
++
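++ /*
++ * node_vmas hang off the stable node sorted by "key" (the vma_slot
++ * pointer value), so all rmap_items from the same slot share one
++ * node_vma. While walking to our insertion point, also charge
++ * pages_bemerged to every slot whose key precedes ours.
++ */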
++ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
++ if (node_vma->key >= key)
++ break;
++
++ if (logdedup) {
++ node_vma->slot->pages_bemerged += factor;
++ if (list_empty(&node_vma->slot->dedup_list))
++ list_add(&node_vma->slot->dedup_list,
++ &vma_slot_dedup);
++ }
++ }
++
++ if (node_vma) {
++ if (node_vma->key == key) {
++ node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist);
++ goto node_vma_ok;
++ } else if (node_vma->key > key) {
++ node_vma_cont = node_vma;
++ }
++ }
++
++node_vma_new:
++ /* no node_vma for this vma in the node yet, alloc a new one */
++ new_node_vma = alloc_node_vma();
++ BUG_ON(!new_node_vma);
++ new_node_vma->head = stable_node;
++ new_node_vma->slot = rmap_item->slot;
++
++ if (!node_vma) {
++ hlist_add_head(&new_node_vma->hlist, &stable_node->hlist);
++ } else if (node_vma->key != key) {
++ if (node_vma->key < key)
++ hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist);
++ else {
++ hlist_add_before(&new_node_vma->hlist,
++ &node_vma->hlist);
++ }
++
++ }
++ node_vma = new_node_vma;
++
++node_vma_ok: /* ok, ready to add to the list */
++ rmap_item->head = node_vma;
++ hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist);
++ hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma);
++ if (logdedup) {
++ rmap_item->slot->pages_merged++;
++ if (node_vma_cont) {
++ node_vma = node_vma_cont;
++ hlist_for_each_entry_continue(node_vma, hlist) {
++ node_vma->slot->pages_bemerged += factor;
++ if (list_empty(&node_vma->slot->dedup_list))
++ list_add(&node_vma->slot->dedup_list,
++ &vma_slot_dedup);
++ }
++ }
++ }
++}
++
++/*
++ * We use break_ksm to break COW on a ksm page: it's a stripped down
++ *
++ * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
++ * put_page(page);
++ *
++ * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
++ * in case the application has unmapped and remapped mm,addr meanwhile.
++ * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
++ * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
++ */
++static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
++{
++ struct page *page;
++ int ret = 0;
++
++ do {
++ cond_resched();
++ page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
++ if (IS_ERR_OR_NULL(page))
++ break;
++ if (PageKsm(page)) {
++ ret = handle_mm_fault(vma, addr,
++ FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE);
++ } else
++ ret = VM_FAULT_WRITE;
++ put_page(page);
++ } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
++ /*
++ * We must loop because handle_mm_fault() may back out if there's
++ * any difficulty e.g. if pte accessed bit gets updated concurrently.
++ *
++ * VM_FAULT_WRITE is what we have been hoping for: it indicates that
++ * COW has been broken, even if the vma does not permit VM_WRITE;
++ * but note that a concurrent fault might break PageKsm for us.
++ *
++ * VM_FAULT_SIGBUS could occur if we race with truncation of the
++ * backing file, which also invalidates anonymous pages: that's
++ * okay, that truncation will have unmapped the PageKsm for us.
++ *
++ * VM_FAULT_OOM: at the time of writing (late July 2009), setting
++ * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
++ * current task has TIF_MEMDIE set, and will be OOM killed on return
++ * to user; and ksmd, having no mm, would never be chosen for that.
++ *
++ * But if the mm is in a limited mem_cgroup, then the fault may fail
++ * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
++ * even ksmd can fail in this way - though it's usually breaking ksm
++ * just to undo a merge it made a moment before, so unlikely to oom.
++ *
++ * That's a pity: we might therefore have more kernel pages allocated
++ * than we're counting as nodes in the stable tree; but uksm_do_scan
++ * will retry to break_cow on each pass, so should recover the page
++ * in due course. The important thing is to not let VM_MERGEABLE
++ * be cleared while any such pages might remain in the area.
++ */
++ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
++}
++
++static void break_cow(struct rmap_item *rmap_item)
++{
++ struct vm_area_struct *vma = rmap_item->slot->vma;
++ struct mm_struct *mm = vma->vm_mm;
++ unsigned long addr = get_rmap_addr(rmap_item);
++
++ if (uksm_test_exit(mm))
++ goto out;
++
++ break_ksm(vma, addr);
++out:
++ return;
++}
++
++/*
++ * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
++ * than check every pte of a given vma, the locking doesn't quite work for
++ * that - an rmap_item is assigned to the stable tree after inserting ksm
++ * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
++ * rmap_items from parent to child at fork time (so as not to waste time
++ * if exit comes before the next scan reaches it).
++ *
++ * Similarly, although we'd like to remove rmap_items (so updating counts
++ * and freeing memory) when unmerging an area, it's easier to leave that
++ * to the next pass of ksmd - consider, for example, how ksmd might be
++ * in cmp_and_merge_page on one of the rmap_items we would be removing.
++ */
++inline int unmerge_uksm_pages(struct vm_area_struct *vma,
++ unsigned long start, unsigned long end)
++{
++ unsigned long addr;
++ int err = 0;
++
++ for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
++ if (uksm_test_exit(vma->vm_mm))
++ break;
++ if (signal_pending(current))
++ err = -ERESTARTSYS;
++ else
++ err = break_ksm(vma, addr);
++ }
++ return err;
++}
++
++static inline void inc_uksm_pages_scanned(void)
++{
++ u64 delta;
++
++
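++ /*
++ * uksm_pages_scanned is the fast running counter; when it saturates
++ * it is folded into pages_scanned_stored, which keeps the long-term
++ * total in units of 2^pages_scanned_base pages. Both the stored
++ * value and the delta are halved (and the base bumped) whenever the
++ * addition would overflow.
++ */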
++ if (uksm_pages_scanned == U64_MAX) {
++ encode_benefit();
++
++ delta = uksm_pages_scanned >> pages_scanned_base;
++
++ if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) {
++ pages_scanned_stored >>= 1;
++ delta >>= 1;
++ pages_scanned_base++;
++ }
++
++ pages_scanned_stored += delta;
++
++ uksm_pages_scanned = uksm_pages_scanned_last = 0;
++ }
++
++ uksm_pages_scanned++;
++}
++
++static inline int find_zero_page_hash(int strength, u32 hash)
++{
++ return (zero_hash_table[strength] == hash);
++}
++
++static
++int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page)
++{
++ struct page *zero_page = empty_uksm_zero_page;
++ struct mm_struct *mm = vma->vm_mm;
++ pte_t orig_pte = __pte(0);
++ int err = -EFAULT;
++
++ if (uksm_test_exit(mm))
++ goto out;
++
++ if (!trylock_page(page))
++ goto out;
++
++ if (!PageAnon(page))
++ goto out_unlock;
++
++ if (PageTransCompound(page)) {
++ err = split_huge_page(page);
++ if (err)
++ goto out_unlock;
++ }
++
++ if (write_protect_page(vma, page, &orig_pte, 0) == 0) {
++ if (is_page_full_zero(page))
++ err = replace_page(vma, page, zero_page, orig_pte);
++ }
++
++out_unlock:
++ unlock_page(page);
++out:
++ return err;
++}
++
++/*
++ * cmp_and_merge_page() - first see if page can be merged into the stable
++ * tree; if not, compare hash to previous and if it's the same, see if page
++ * can be inserted into the unstable tree, or merged with a page already there
++ * and both transferred to the stable tree.
++ *
++ * @page: the page that we are searching identical page to.
++ * @rmap_item: the reverse mapping into the virtual address of this page
++ */
++static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash)
++{
++ struct rmap_item *tree_rmap_item;
++ struct page *page;
++ struct page *kpage = NULL;
++ u32 hash_max;
++ int err;
++ unsigned int success1, success2;
++ struct stable_node *snode;
++ int cmp;
++ struct rb_node *parent = NULL, **new;
++
++ remove_rmap_item_from_tree(rmap_item);
++ page = rmap_item->page;
++
++ /* We first start with searching the page inside the stable tree */
++ kpage = stable_tree_search(rmap_item, hash);
++ if (kpage) {
++ err = try_to_merge_with_uksm_page(rmap_item, kpage,
++ hash);
++ if (!err) {
++ /*
++ * The page was successfully merged, add
++ * its rmap_item to the stable tree.
++ * page lock is needed because it's
++ * racing with try_to_unmap_ksm(), etc.
++ */
++ lock_page(kpage);
++ snode = page_stable_node(kpage);
++ stable_tree_append(rmap_item, snode, 1);
++ unlock_page(kpage);
++ put_page(kpage);
++ return; /* success */
++ }
++ put_page(kpage);
++
++ /*
++ * If it's a collision and the sub-rbtree has already been
++ * searched (hash_max != 0), we want to abort, because if it
++ * were merged in the unstable tree the collision would tend
++ * to happen again.
++ */
++ if (err == MERGE_ERR_COLLI && rmap_item->hash_max)
++ return;
++ }
++
++ tree_rmap_item =
++ unstable_tree_search_insert(rmap_item, hash);
++ if (tree_rmap_item) {
++ err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash);
++ /*
++ * As soon as we merge this page, we want to remove the
++ * rmap_item of the page we have merged with from the unstable
++ * tree, and insert it instead as new node in the stable tree.
++ */
++ if (!err) {
++ kpage = page;
++ remove_rmap_item_from_tree(tree_rmap_item);
++ lock_page(kpage);
++ snode = stable_tree_insert(&kpage, hash,
++ rmap_item, tree_rmap_item,
++ &success1, &success2);
++
++ /*
++ * Do not log dedup for tree item, it's not counted as
++ * scanned in this round.
++ */
++ if (success2)
++ stable_tree_append(tree_rmap_item, snode, 0);
++
++ /*
++ * The order of these two stable_tree_append() calls is
++ * important: rmap_item is the one being scanned, so it is
++ * the one whose dedup statistics are logged.
++ */
++ if (success1)
++ stable_tree_append(rmap_item, snode, 1);
++
++ /*
++ * The original kpage may already have been unlocked
++ * inside stable_tree_insert() (kpage then points to the
++ * stable tree page). It must be unlocked before calling
++ * break_cow().
++ */
++ unlock_page(kpage);
++
++ if (!success1)
++ break_cow(rmap_item);
++
++ if (!success2)
++ break_cow(tree_rmap_item);
++
++ } else if (err == MERGE_ERR_COLLI) {
++ BUG_ON(tree_rmap_item->tree_node->count > 1);
++
++ rmap_item_hash_max(tree_rmap_item,
++ tree_rmap_item->tree_node->hash);
++
++ hash_max = rmap_item_hash_max(rmap_item, hash);
++ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
++ parent = &tree_rmap_item->node;
++ if (cmp < 0)
++ new = &parent->rb_left;
++ else if (cmp > 0)
++ new = &parent->rb_right;
++ else
++ goto put_up_out;
++
++ rmap_item->tree_node = tree_rmap_item->tree_node;
++ rmap_item->address |= UNSTABLE_FLAG;
++ rmap_item->hash_round = uksm_hash_round;
++ rb_link_node(&rmap_item->node, parent, new);
++ rb_insert_color(&rmap_item->node,
++ &tree_rmap_item->tree_node->sub_root);
++ rmap_item->tree_node->count++;
++ } else {
++ /*
++ * Either one of the pages has changed, or they collide
++ * even at the max hash; treat them as ill items.
++ */
++ remove_rmap_item_from_tree(tree_rmap_item);
++ }
++put_up_out:
++ put_page(tree_rmap_item->page);
++ up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem);
++ }
++}
++
++static inline unsigned long get_pool_index(struct vma_slot *slot,
++ unsigned long index)
++{
++ unsigned long pool_index;
++
++ pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT;
++ BUG_ON(pool_index >= slot->pool_size);
++ return pool_index;
++}
++
++static inline unsigned long index_page_offset(unsigned long index)
++{
++ return offset_in_page(sizeof(struct rmap_list_entry *) * index);
++}
++
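++/*
++ * The rmap_list of a slot is stored in an array of pool pages indexed by
++ * get_pool_index()/index_page_offset(). For example, on a 64-bit kernel
++ * with 4 KiB pages, sizeof(struct rmap_list_entry *) is 8, so each pool
++ * page holds 512 entries and index 1000 lands in pool page 1 at byte
++ * offset 3904.
++ */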
++static inline
++struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot,
++ unsigned long index, int need_alloc)
++{
++ unsigned long pool_index;
++ struct page *page;
++ void *addr;
++
++
++ pool_index = get_pool_index(slot, index);
++ if (!slot->rmap_list_pool[pool_index]) {
++ if (!need_alloc)
++ return NULL;
++
++ page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
++ if (!page)
++ return NULL;
++
++ slot->rmap_list_pool[pool_index] = page;
++ }
++
++ addr = kmap(slot->rmap_list_pool[pool_index]);
++ addr += index_page_offset(index);
++
++ return addr;
++}
++
++static inline void put_rmap_list_entry(struct vma_slot *slot,
++ unsigned long index)
++{
++ unsigned long pool_index;
++
++ pool_index = get_pool_index(slot, index);
++ BUG_ON(!slot->rmap_list_pool[pool_index]);
++ kunmap(slot->rmap_list_pool[pool_index]);
++}
++
++static inline int entry_is_new(struct rmap_list_entry *entry)
++{
++ return !entry->item;
++}
++
++static inline unsigned long get_index_orig_addr(struct vma_slot *slot,
++ unsigned long index)
++{
++ return slot->vma->vm_start + (index << PAGE_SHIFT);
++}
++
++static inline unsigned long get_entry_address(struct rmap_list_entry *entry)
++{
++ unsigned long addr;
++
++ if (is_addr(entry->addr))
++ addr = get_clean_addr(entry->addr);
++ else if (entry->item)
++ addr = get_rmap_addr(entry->item);
++ else
++ BUG();
++
++ return addr;
++}
++
++static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry)
++{
++ if (is_addr(entry->addr))
++ return NULL;
++
++ return entry->item;
++}
++
++static inline void inc_rmap_list_pool_count(struct vma_slot *slot,
++ unsigned long index)
++{
++ unsigned long pool_index;
++
++ pool_index = get_pool_index(slot, index);
++ BUG_ON(!slot->rmap_list_pool[pool_index]);
++ slot->pool_counts[pool_index]++;
++}
++
++static inline void dec_rmap_list_pool_count(struct vma_slot *slot,
++ unsigned long index)
++{
++ unsigned long pool_index;
++
++ pool_index = get_pool_index(slot, index);
++ BUG_ON(!slot->rmap_list_pool[pool_index]);
++ BUG_ON(!slot->pool_counts[pool_index]);
++ slot->pool_counts[pool_index]--;
++}
++
++static inline int entry_has_rmap(struct rmap_list_entry *entry)
++{
++ return !is_addr(entry->addr) && entry->item;
++}
++
++static inline void swap_entries(struct rmap_list_entry *entry1,
++ unsigned long index1,
++ struct rmap_list_entry *entry2,
++ unsigned long index2)
++{
++ struct rmap_list_entry tmp;
++
++ /* swapping two new entries is meaningless */
++ BUG_ON(entry_is_new(entry1) && entry_is_new(entry2));
++
++ tmp = *entry1;
++ *entry1 = *entry2;
++ *entry2 = tmp;
++
++ if (entry_has_rmap(entry1))
++ entry1->item->entry_index = index1;
++
++ if (entry_has_rmap(entry2))
++ entry2->item->entry_index = index2;
++
++ if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) {
++ inc_rmap_list_pool_count(entry1->item->slot, index1);
++ dec_rmap_list_pool_count(entry1->item->slot, index2);
++ } else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) {
++ inc_rmap_list_pool_count(entry2->item->slot, index2);
++ dec_rmap_list_pool_count(entry2->item->slot, index1);
++ }
++}
++
++static inline void free_entry_item(struct rmap_list_entry *entry)
++{
++ unsigned long index;
++ struct rmap_item *item;
++
++ if (!is_addr(entry->addr)) {
++ BUG_ON(!entry->item);
++ item = entry->item;
++ entry->addr = get_rmap_addr(item);
++ set_is_addr(entry->addr);
++ index = item->entry_index;
++ remove_rmap_item_from_tree(item);
++ dec_rmap_list_pool_count(item->slot, index);
++ free_rmap_item(item);
++ }
++}
++
++static inline int pool_entry_boundary(unsigned long index)
++{
++ unsigned long linear_addr;
++
++ linear_addr = sizeof(struct rmap_list_entry *) * index;
++ return index && !offset_in_page(linear_addr);
++}
++
++static inline void try_free_last_pool(struct vma_slot *slot,
++ unsigned long index)
++{
++ unsigned long pool_index;
++
++ pool_index = get_pool_index(slot, index);
++ if (slot->rmap_list_pool[pool_index] &&
++ !slot->pool_counts[pool_index]) {
++ __free_page(slot->rmap_list_pool[pool_index]);
++ slot->rmap_list_pool[pool_index] = NULL;
++ slot->flags |= UKSM_SLOT_NEED_SORT;
++ }
++
++}
++
++static inline unsigned long vma_item_index(struct vm_area_struct *vma,
++ struct rmap_item *item)
++{
++ return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT;
++}
++
++static int within_same_pool(struct vma_slot *slot,
++ unsigned long i, unsigned long j)
++{
++ unsigned long pool_i, pool_j;
++
++ pool_i = get_pool_index(slot, i);
++ pool_j = get_pool_index(slot, j);
++
++ return (pool_i == pool_j);
++}
++
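++/*
++ * Restore the randomly permuted rmap_list to address order: for each
++ * position, keep swapping the current entry with the entry that lives at
++ * its home index (vma_item_index) until the right one lands here, then
++ * advance; effectively an in-place cycle sort across the pool pages.
++ */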
++static void sort_rmap_entry_list(struct vma_slot *slot)
++{
++ unsigned long i, j;
++ struct rmap_list_entry *entry, *swap_entry;
++
++ entry = get_rmap_list_entry(slot, 0, 0);
++ for (i = 0; i < slot->pages; ) {
++
++ if (!entry)
++ goto skip_whole_pool;
++
++ if (entry_is_new(entry))
++ goto next_entry;
++
++ if (is_addr(entry->addr)) {
++ entry->addr = 0;
++ goto next_entry;
++ }
++
++ j = vma_item_index(slot->vma, entry->item);
++ if (j == i)
++ goto next_entry;
++
++ if (within_same_pool(slot, i, j))
++ swap_entry = entry + j - i;
++ else
++ swap_entry = get_rmap_list_entry(slot, j, 1);
++
++ swap_entries(entry, i, swap_entry, j);
++ if (!within_same_pool(slot, i, j))
++ put_rmap_list_entry(slot, j);
++ continue;
++
++skip_whole_pool:
++ i += PAGE_SIZE / sizeof(*entry);
++ if (i < slot->pages)
++ entry = get_rmap_list_entry(slot, i, 0);
++ continue;
++
++next_entry:
++ if (i >= slot->pages - 1 ||
++ !within_same_pool(slot, i, i + 1)) {
++ put_rmap_list_entry(slot, i);
++ if (i + 1 < slot->pages)
++ entry = get_rmap_list_entry(slot, i + 1, 0);
++ } else
++ entry++;
++ i++;
++ continue;
++ }
++
++ /*
++ * Free empty pool pages which contain no rmap_item.
++ * TODO: this can be simplified to rely on pool_counts alone once
++ * that accounting is verified bug-free.
++ */
++ for (i = 0; i < slot->pool_size; i++) {
++ unsigned char has_rmap;
++ void *addr;
++
++ if (!slot->rmap_list_pool[i])
++ continue;
++
++ has_rmap = 0;
++ addr = kmap(slot->rmap_list_pool[i]);
++ BUG_ON(!addr);
++ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
++ entry = (struct rmap_list_entry *)addr + j;
++ if (is_addr(entry->addr))
++ continue;
++ if (!entry->item)
++ continue;
++ has_rmap = 1;
++ }
++ kunmap(slot->rmap_list_pool[i]);
++ if (!has_rmap) {
++ BUG_ON(slot->pool_counts[i]);
++ __free_page(slot->rmap_list_pool[i]);
++ slot->rmap_list_pool[i] = NULL;
++ }
++ }
++
++ slot->flags &= ~UKSM_SLOT_NEED_SORT;
++}
++
++/*
++ * vma_fully_scanned() - return whether all the pages in this slot have
++ * been scanned.
++ */
++static inline int vma_fully_scanned(struct vma_slot *slot)
++{
++ return slot->pages_scanned == slot->pages;
++}
++
++/**
++ * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to
++ * its random permutation. The random-permutation index management code is
++ * embedded in this function as well.
++ */
++static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash)
++{
++ unsigned long rand_range, addr, swap_index, scan_index;
++ struct rmap_item *item = NULL;
++ struct rmap_list_entry *scan_entry, *swap_entry = NULL;
++ struct page *page;
++
++ scan_index = swap_index = slot->pages_scanned % slot->pages;
++
++ if (pool_entry_boundary(scan_index))
++ try_free_last_pool(slot, scan_index - 1);
++
++ if (vma_fully_scanned(slot)) {
++ if (slot->flags & UKSM_SLOT_NEED_SORT)
++ slot->flags |= UKSM_SLOT_NEED_RERAND;
++ else
++ slot->flags &= ~UKSM_SLOT_NEED_RERAND;
++ if (slot->flags & UKSM_SLOT_NEED_SORT)
++ sort_rmap_entry_list(slot);
++ }
++
++ scan_entry = get_rmap_list_entry(slot, scan_index, 1);
++ if (!scan_entry)
++ return NULL;
++
++ if (entry_is_new(scan_entry)) {
++ scan_entry->addr = get_index_orig_addr(slot, scan_index);
++ set_is_addr(scan_entry->addr);
++ }
++
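++ /*
++ * One step of an incremental Fisher-Yates shuffle: when the slot
++ * needs re-randomization, the entry at scan_index is exchanged with
++ * a uniformly chosen entry in [scan_index, pages), so a full round
++ * still visits every page exactly once, in random order. Otherwise
++ * the previously established permutation is simply replayed.
++ */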
++ if (slot->flags & UKSM_SLOT_NEED_RERAND) {
++ rand_range = slot->pages - scan_index;
++ BUG_ON(!rand_range);
++ swap_index = scan_index + (prandom_u32() % rand_range);
++ }
++
++ if (swap_index != scan_index) {
++ swap_entry = get_rmap_list_entry(slot, swap_index, 1);
++
++ if (!swap_entry)
++ return NULL;
++
++ if (entry_is_new(swap_entry)) {
++ swap_entry->addr = get_index_orig_addr(slot,
++ swap_index);
++ set_is_addr(swap_entry->addr);
++ }
++ swap_entries(scan_entry, scan_index, swap_entry, swap_index);
++ }
++
++ addr = get_entry_address(scan_entry);
++ item = get_entry_item(scan_entry);
++ BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start);
++
++ page = follow_page(slot->vma, addr, FOLL_GET);
++ if (IS_ERR_OR_NULL(page))
++ goto nopage;
++
++ if (!PageAnon(page))
++ goto putpage;
++
++ /* check whether it is the zero_page pfn or the uksm_zero_page */
++ if ((page_to_pfn(page) == zero_pfn)
++ || (page_to_pfn(page) == uksm_zero_pfn))
++ goto putpage;
++
++ flush_anon_page(slot->vma, page, addr);
++ flush_dcache_page(page);
++
++
++ *hash = page_hash(page, hash_strength, 1);
++ inc_uksm_pages_scanned();
++ /* if the page content is all zero, re-map it to the zero page */
++ if (find_zero_page_hash(hash_strength, *hash)) {
++ if (!cmp_and_merge_zero_page(slot->vma, page)) {
++ slot->pages_merged++;
++
++ /* For full-zero pages, no need to create rmap item */
++ goto putpage;
++ } else {
++ inc_rshash_neg(memcmp_cost / 2);
++ }
++ }
++
++ if (!item) {
++ item = alloc_rmap_item();
++ if (item) {
++ /* It has already been zeroed */
++ item->slot = slot;
++ item->address = addr;
++ item->entry_index = scan_index;
++ scan_entry->item = item;
++ inc_rmap_list_pool_count(slot, scan_index);
++ } else
++ goto putpage;
++ }
++
++ BUG_ON(item->slot != slot);
++ /* the page may have changed */
++ item->page = page;
++ put_rmap_list_entry(slot, scan_index);
++ if (swap_entry)
++ put_rmap_list_entry(slot, swap_index);
++ return item;
++
++putpage:
++ put_page(page);
++ page = NULL;
++nopage:
++ /* no page, store addr back and free rmap_item if possible */
++ free_entry_item(scan_entry);
++ put_rmap_list_entry(slot, scan_index);
++ if (swap_entry)
++ put_rmap_list_entry(slot, swap_index);
++ return NULL;
++}
++
++static inline int in_stable_tree(struct rmap_item *rmap_item)
++{
++ return rmap_item->address & STABLE_FLAG;
++}
++
++/**
++ * scan_vma_one_page() - scan the next page in a vma_slot. Called with
++ * mmap_sem locked.
++ */
++static noinline void scan_vma_one_page(struct vma_slot *slot)
++{
++ u32 hash;
++ struct mm_struct *mm;
++ struct rmap_item *rmap_item = NULL;
++ struct vm_area_struct *vma = slot->vma;
++
++ mm = vma->vm_mm;
++ BUG_ON(!mm);
++ BUG_ON(!slot);
++
++ rmap_item = get_next_rmap_item(slot, &hash);
++ if (!rmap_item)
++ goto out1;
++
++ if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item))
++ goto out2;
++
++ cmp_and_merge_page(rmap_item, hash);
++out2:
++ put_page(rmap_item->page);
++out1:
++ slot->pages_scanned++;
++ slot->this_sampled++;
++ if (slot->fully_scanned_round != fully_scanned_round)
++ scanned_virtual_pages++;
++
++ if (vma_fully_scanned(slot))
++ slot->fully_scanned_round = fully_scanned_round;
++}
++
++static inline unsigned long rung_get_pages(struct scan_rung *rung)
++{
++ struct slot_tree_node *node;
++
++ if (!rung->vma_root.rnode)
++ return 0;
++
++ node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode);
++
++ return node->size;
++}
++
++#define RUNG_SAMPLED_MIN 3
++
++static inline
++void uksm_calc_rung_step(struct scan_rung *rung,
++ unsigned long page_time, unsigned long ratio)
++{
++ unsigned long sampled, pages;
++
++ /* is this rung configured to be fully scanned? */
++ if (!rung->cover_msecs) {
++ rung->step = 1;
++ return;
++ }
++
++ sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE)
++ * ratio / page_time;
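++ /*
++ * "sampled" is the number of pages this rung can visit within
++ * cover_msecs at the given cpu ratio (NSEC_PER_MSEC / TIME_RATIO_SCALE
++ * evaluates to 100 here). For example, cover_msecs = 1000,
++ * ratio = 2000 and page_time = 40000 ns give
++ * sampled = 1000 * 100 * 2000 / 40000 = 5000, so a rung holding
++ * 1000000 pages ends up with step = 200 below.
++ */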
++
++ /*
++ * Before we finish a scan round and its expensive per-round jobs,
++ * we need a chance to estimate the per-page time, so the sampled
++ * number cannot be too small.
++ */
++ if (sampled < RUNG_SAMPLED_MIN)
++ sampled = RUNG_SAMPLED_MIN;
++
++ pages = rung_get_pages(rung);
++ if (likely(pages > sampled))
++ rung->step = pages / sampled;
++ else
++ rung->step = 1;
++}
++
++static inline int step_need_recalc(struct scan_rung *rung)
++{
++ unsigned long pages, stepmax;
++
++ pages = rung_get_pages(rung);
++ stepmax = pages / RUNG_SAMPLED_MIN;
++
++ return pages && (rung->step > pages ||
++ (stepmax && rung->step > stepmax));
++}
++
++static inline
++void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc)
++{
++ struct vma_slot *slot;
++
++ if (finished)
++ rung->flags |= UKSM_RUNG_ROUND_FINISHED;
++
++ if (step_recalc || step_need_recalc(rung)) {
++ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
++ BUG_ON(step_need_recalc(rung));
++ }
++
++ slot_iter_index = prandom_u32() % rung->step;
++ BUG_ON(!rung->vma_root.rnode);
++ slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter);
++ BUG_ON(!slot);
++
++ rung->current_scan = slot;
++ rung->current_offset = slot_iter_index;
++}
++
++static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot)
++{
++ return &slot->rung->vma_root;
++}
++
++/*
++ * Return 1 if the rung's current scan position was reset, 0 otherwise.
++ */
++static int advance_current_scan(struct scan_rung *rung)
++{
++ unsigned short n;
++ struct vma_slot *slot, *next = NULL;
++
++ BUG_ON(!rung->vma_root.num);
++
++ slot = rung->current_scan;
++ n = (slot->pages - rung->current_offset) % rung->step;
++ slot_iter_index = rung->step - n;
++ next = sradix_tree_next(&rung->vma_root, slot->snode,
++ slot->sindex, slot_iter);
++
++ if (next) {
++ rung->current_offset = slot_iter_index;
++ rung->current_scan = next;
++ return 0;
++ } else {
++ reset_current_scan(rung, 1, 0);
++ return 1;
++ }
++}
++
++static inline void rung_rm_slot(struct vma_slot *slot)
++{
++ struct scan_rung *rung = slot->rung;
++ struct sradix_tree_root *root;
++
++ if (rung->current_scan == slot)
++ advance_current_scan(rung);
++
++ root = slot_get_root(slot);
++ sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex);
++ slot->snode = NULL;
++ if (step_need_recalc(rung)) {
++ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
++ BUG_ON(step_need_recalc(rung));
++ }
++
++ /* In case advance_current_scan() looped back to this slot again */
++ if (rung->vma_root.num && rung->current_scan == slot)
++ reset_current_scan(slot->rung, 1, 0);
++}
++
++static inline void rung_add_new_slots(struct scan_rung *rung,
++ struct vma_slot **slots, unsigned long num)
++{
++ int err;
++ struct vma_slot *slot;
++ unsigned long i;
++ struct sradix_tree_root *root = &rung->vma_root;
++
++ err = sradix_tree_enter(root, (void **)slots, num);
++ BUG_ON(err);
++
++ for (i = 0; i < num; i++) {
++ slot = slots[i];
++ slot->rung = rung;
++ BUG_ON(vma_fully_scanned(slot));
++ }
++
++ if (rung->vma_root.num == num)
++ reset_current_scan(rung, 0, 1);
++}
++
++static inline int rung_add_one_slot(struct scan_rung *rung,
++ struct vma_slot *slot)
++{
++ int err;
++
++ err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1);
++ if (err)
++ return err;
++
++ slot->rung = rung;
++ if (rung->vma_root.num == 1)
++ reset_current_scan(rung, 0, 1);
++
++ return 0;
++}
++
++/*
++ * Return true if the slot is deleted from its rung.
++ */
++static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung)
++{
++ struct scan_rung *old_rung = slot->rung;
++ int err;
++
++ if (old_rung == rung)
++ return 0;
++
++ rung_rm_slot(slot);
++ err = rung_add_one_slot(rung, slot);
++ if (err) {
++ err = rung_add_one_slot(old_rung, slot);
++ WARN_ON(err); /* OOPS, badly OOM, we lost this slot */
++ }
++
++ return 1;
++}
++
++static inline int vma_rung_up(struct vma_slot *slot)
++{
++ struct scan_rung *rung;
++
++ rung = slot->rung;
++ if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1])
++ rung++;
++
++ return vma_rung_enter(slot, rung);
++}
++
++static inline int vma_rung_down(struct vma_slot *slot)
++{
++ struct scan_rung *rung;
++
++ rung = slot->rung;
++ if (slot->rung != &uksm_scan_ladder[0])
++ rung--;
++
++ return vma_rung_enter(slot, rung);
++}
++
++/**
++ * cal_dedup_ratio() - Calculate the deduplication ratio for this slot.
++ */
++static unsigned long cal_dedup_ratio(struct vma_slot *slot)
++{
++ unsigned long ret;
++ unsigned long pages;
++
++ pages = slot->this_sampled;
++ if (!pages)
++ return 0;
++
++ BUG_ON(slot->pages_scanned == slot->last_scanned);
++
++ ret = slot->pages_merged;
++
++ /* Thrashing area filtering */
++ if (ret && uksm_thrash_threshold) {
++ if (slot->pages_cowed * 100 / slot->pages_merged
++ > uksm_thrash_threshold) {
++ ret = 0;
++ } else {
++ ret = slot->pages_merged - slot->pages_cowed;
++ }
++ }
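++ /*
++ * For example, with uksm_thrash_threshold = 20, a slot that sampled
++ * 1000 pages, merged 100 and saw 30 of them COWed again is treated
++ * as thrashing (30 * 100 / 100 = 30 > 20) and reports 0; with only
++ * 10 COWed pages it reports (100 - 10) * 100 / 1000 = 9.
++ */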
++
++ return ret * 100 / pages;
++}
++
++/**
++ * cal_dedup_ratio_old() - Calculate the deduplication ratio for this slot
++ * based on the pages merged against it (pages_bemerged).
++ */
++static unsigned long cal_dedup_ratio_old(struct vma_slot *slot)
++{
++ unsigned long ret;
++ unsigned long pages;
++
++ pages = slot->pages;
++ if (!pages)
++ return 0;
++
++ ret = slot->pages_bemerged;
++
++ /* Thrashing area filtering */
++ if (ret && uksm_thrash_threshold) {
++ if (slot->pages_cowed * 100 / slot->pages_bemerged
++ > uksm_thrash_threshold) {
++ ret = 0;
++ } else {
++ ret = slot->pages_bemerged - slot->pages_cowed;
++ }
++ }
++
++ return ret * 100 / pages;
++}
++
++/**
++ * stable_node_reinsert() - When the hash_strength has been adjusted, the
++ * stable tree needs to be restructured; this is the function that re-inserts
++ * a stable node into the new tree.
++ */
++static inline void stable_node_reinsert(struct stable_node *new_node,
++ struct page *page,
++ struct rb_root *root_treep,
++ struct list_head *tree_node_listp,
++ u32 hash)
++{
++ struct rb_node **new = &root_treep->rb_node;
++ struct rb_node *parent = NULL;
++ struct stable_node *stable_node;
++ struct tree_node *tree_node;
++ struct page *tree_page;
++ int cmp;
++
++ while (*new) {
++ int cmp;
++
++ tree_node = rb_entry(*new, struct tree_node, node);
++
++ cmp = hash_cmp(hash, tree_node->hash);
++
++ if (cmp < 0) {
++ parent = *new;
++ new = &parent->rb_left;
++ } else if (cmp > 0) {
++ parent = *new;
++ new = &parent->rb_right;
++ } else
++ break;
++ }
++
++ if (*new) {
++ /* found a tree node with the same first-level hash value */
++ stable_node_hash_max(new_node, page, hash);
++ if (tree_node->count == 1) {
++ stable_node = rb_entry(tree_node->sub_root.rb_node,
++ struct stable_node, node);
++ tree_page = get_uksm_page(stable_node, 1, 0);
++ if (tree_page) {
++ stable_node_hash_max(stable_node,
++ tree_page, hash);
++ put_page(tree_page);
++
++ /* prepare for stable node insertion */
++
++ cmp = hash_cmp(new_node->hash_max,
++ stable_node->hash_max);
++ parent = &stable_node->node;
++ if (cmp < 0)
++ new = &parent->rb_left;
++ else if (cmp > 0)
++ new = &parent->rb_right;
++ else
++ goto failed;
++
++ goto add_node;
++ } else {
++ /*
++ * The only stable_node was deleted but the
++ * tree node was not; reuse it.
++ */
++ goto tree_node_reuse;
++ }
++ }
++
++ /* well, search the collision subtree */
++ new = &tree_node->sub_root.rb_node;
++ parent = NULL;
++ BUG_ON(!*new);
++ while (*new) {
++ int cmp;
++
++ stable_node = rb_entry(*new, struct stable_node, node);
++
++ cmp = hash_cmp(new_node->hash_max,
++ stable_node->hash_max);
++
++ if (cmp < 0) {
++ parent = *new;
++ new = &parent->rb_left;
++ } else if (cmp > 0) {
++ parent = *new;
++ new = &parent->rb_right;
++ } else {
++ /* oh, no, still a collision */
++ goto failed;
++ }
++ }
++
++ goto add_node;
++ }
++
++ /* no tree node found */
++ tree_node = alloc_tree_node(tree_node_listp);
++ if (!tree_node) {
++ pr_err("UKSM: memory allocation error!\n");
++ goto failed;
++ } else {
++ tree_node->hash = hash;
++ rb_link_node(&tree_node->node, parent, new);
++ rb_insert_color(&tree_node->node, root_treep);
++
++tree_node_reuse:
++ /* prepare for stable node insertion */
++ parent = NULL;
++ new = &tree_node->sub_root.rb_node;
++ }
++
++add_node:
++ rb_link_node(&new_node->node, parent, new);
++ rb_insert_color(&new_node->node, &tree_node->sub_root);
++ new_node->tree_node = tree_node;
++ tree_node->count++;
++ return;
++
++failed:
++ /*
++ * This can only happen when two nodes have collided
++ * at both hash levels.
++ */
++ new_node->tree_node = NULL;
++ return;
++}
++
++static inline void free_all_tree_nodes(struct list_head *list)
++{
++ struct tree_node *node, *tmp;
++
++ list_for_each_entry_safe(node, tmp, list, all_list) {
++ free_tree_node(node);
++ }
++}
++
++/**
++ * stable_tree_delta_hash() - Delta hash the stable tree from previous hash
++ * strength to the current hash_strength. It re-structures the whole tree.
++ */
++static inline void stable_tree_delta_hash(u32 prev_hash_strength)
++{
++ struct stable_node *node, *tmp;
++ struct rb_root *root_new_treep;
++ struct list_head *new_tree_node_listp;
++
++ stable_tree_index = (stable_tree_index + 1) % 2;
++ root_new_treep = &root_stable_tree[stable_tree_index];
++ new_tree_node_listp = &stable_tree_node_list[stable_tree_index];
++ *root_new_treep = RB_ROOT;
++ BUG_ON(!list_empty(new_tree_node_listp));
++
++ /*
++ * we need to be safe, the node could be removed by get_uksm_page()
++ */
++ list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) {
++ void *addr;
++ struct page *node_page;
++ u32 hash;
++
++ /*
++ * We are completely re-structuring the stable nodes into a new
++ * stable tree. We don't bother unlinking them from the old tree
++ * or touching the old tree_nodes; the old tree_nodes will all be
++ * freed at once.
++ */
++ node_page = get_uksm_page(node, 0, 0);
++ if (!node_page)
++ continue;
++
++ if (node->tree_node) {
++ hash = node->tree_node->hash;
++
++ addr = kmap_atomic(node_page);
++
++ hash = delta_hash(addr, prev_hash_strength,
++ hash_strength, hash);
++ kunmap_atomic(addr);
++ } else {
++ /*
++ * It was not inserted into the rbtree due to a collision
++ * in the last scan round.
++ */
++ hash = page_hash(node_page, hash_strength, 0);
++ }
++
++ stable_node_reinsert(node, node_page, root_new_treep,
++ new_tree_node_listp, hash);
++ put_page(node_page);
++ }
++
++ root_stable_treep = root_new_treep;
++ free_all_tree_nodes(stable_tree_node_listp);
++ BUG_ON(!list_empty(stable_tree_node_listp));
++ stable_tree_node_listp = new_tree_node_listp;
++}
++
++static inline void inc_hash_strength(unsigned long delta)
++{
++ hash_strength += 1 << delta;
++ if (hash_strength > HASH_STRENGTH_MAX)
++ hash_strength = HASH_STRENGTH_MAX;
++}
++
++static inline void dec_hash_strength(unsigned long delta)
++{
++ unsigned long change = 1 << delta;
++
++ if (hash_strength <= change + 1)
++ hash_strength = 1;
++ else
++ hash_strength -= change;
++}
++
++static inline void inc_hash_strength_delta(void)
++{
++ hash_strength_delta++;
++ if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX)
++ hash_strength_delta = HASH_STRENGTH_DELTA_MAX;
++}
++
++static inline unsigned long get_current_neg_ratio(void)
++{
++ u64 pos = benefit.pos;
++ u64 neg = benefit.neg;
++
++ if (!neg)
++ return 0;
++
++ if (!pos || neg > pos)
++ return 100;
++
++ if (neg > div64_u64(U64_MAX, 100))
++ pos = div64_u64(pos, 100);
++ else
++ neg *= 100;
++
++ return div64_u64(neg, pos);
++}
++
++static inline unsigned long get_current_benefit(void)
++{
++ u64 pos = benefit.pos;
++ u64 neg = benefit.neg;
++ u64 scanned = benefit.scanned;
++
++ if (neg > pos)
++ return 0;
++
++ return div64_u64((pos - neg), scanned);
++}
++
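++/*
++ * The helpers above express the sampling outcome as a neg/pos percentage
++ * and as a net benefit (pos - neg) per scanned page; judge_rshash_direction()
++ * below compares these against the recorded stable_benefit to pick a
++ * direction for hash_strength.
++ */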
++static inline int judge_rshash_direction(void)
++{
++ u64 current_neg_ratio, stable_benefit;
++ u64 current_benefit, delta = 0;
++ int ret = STILL;
++
++ /*
++ * Try to probe a value shortly after boot, and again periodically
++ * in case the system stays unchanged for a long time.
++ */
++ if ((fully_scanned_round & 0xFFULL) == 10) {
++ ret = OBSCURE;
++ goto out;
++ }
++
++ current_neg_ratio = get_current_neg_ratio();
++
++ if (current_neg_ratio == 0) {
++ rshash_neg_cont_zero++;
++ if (rshash_neg_cont_zero > 2)
++ return GO_DOWN;
++ else
++ return STILL;
++ }
++ rshash_neg_cont_zero = 0;
++
++ if (current_neg_ratio > 90) {
++ ret = GO_UP;
++ goto out;
++ }
++
++ current_benefit = get_current_benefit();
++ stable_benefit = rshash_state.stable_benefit;
++
++ if (!stable_benefit) {
++ ret = OBSCURE;
++ goto out;
++ }
++
++ if (current_benefit > stable_benefit)
++ delta = current_benefit - stable_benefit;
++ else if (current_benefit < stable_benefit)
++ delta = stable_benefit - current_benefit;
++
++ delta = div64_u64(100 * delta, stable_benefit);
++
++ if (delta > 50) {
++ rshash_cont_obscure++;
++ if (rshash_cont_obscure > 2)
++ return OBSCURE;
++ else
++ return STILL;
++ }
++
++out:
++ rshash_cont_obscure = 0;
++ return ret;
++}
++
++/**
++ * rshash_adjust() - The main function controlling the random-sampling state
++ * machine for hash-strength adaptation.
++ *
++ * @return true if hash_strength has changed.
++ */
++static inline int rshash_adjust(void)
++{
++ unsigned long prev_hash_strength = hash_strength;
++
++ if (!encode_benefit())
++ return 0;
++
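++ /*
++ * Rough sketch of the state machine below: RSHASH_STILL nudges the
++ * strength while the benefit trend is clear; on OBSCURE the current
++ * strength is saved as stable_point and the machine walks down
++ * (RSHASH_TRYDOWN) and then up (RSHASH_TRYUP) looking for turning
++ * points, picks the turning point with the higher benefit and
++ * settles back to RSHASH_STILL via RSHASH_PRE_STILL.
++ */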
++ switch (rshash_state.state) {
++ case RSHASH_STILL:
++ switch (judge_rshash_direction()) {
++ case GO_UP:
++ if (rshash_state.pre_direct == GO_DOWN)
++ hash_strength_delta = 0;
++
++ inc_hash_strength(hash_strength_delta);
++ inc_hash_strength_delta();
++ rshash_state.stable_benefit = get_current_benefit();
++ rshash_state.pre_direct = GO_UP;
++ break;
++
++ case GO_DOWN:
++ if (rshash_state.pre_direct == GO_UP)
++ hash_strength_delta = 0;
++
++ dec_hash_strength(hash_strength_delta);
++ inc_hash_strength_delta();
++ rshash_state.stable_benefit = get_current_benefit();
++ rshash_state.pre_direct = GO_DOWN;
++ break;
++
++ case OBSCURE:
++ rshash_state.stable_point = hash_strength;
++ rshash_state.turn_point_down = hash_strength;
++ rshash_state.turn_point_up = hash_strength;
++ rshash_state.turn_benefit_down = get_current_benefit();
++ rshash_state.turn_benefit_up = get_current_benefit();
++ rshash_state.lookup_window_index = 0;
++ rshash_state.state = RSHASH_TRYDOWN;
++ dec_hash_strength(hash_strength_delta);
++ inc_hash_strength_delta();
++ break;
++
++ case STILL:
++ break;
++ default:
++ BUG();
++ }
++ break;
++
++ case RSHASH_TRYDOWN:
++ if (rshash_state.lookup_window_index++ % 5 == 0)
++ rshash_state.below_count = 0;
++
++ if (get_current_benefit() < rshash_state.stable_benefit)
++ rshash_state.below_count++;
++ else if (get_current_benefit() >
++ rshash_state.turn_benefit_down) {
++ rshash_state.turn_point_down = hash_strength;
++ rshash_state.turn_benefit_down = get_current_benefit();
++ }
++
++ if (rshash_state.below_count >= 3 ||
++ judge_rshash_direction() == GO_UP ||
++ hash_strength == 1) {
++ hash_strength = rshash_state.stable_point;
++ hash_strength_delta = 0;
++ inc_hash_strength(hash_strength_delta);
++ inc_hash_strength_delta();
++ rshash_state.lookup_window_index = 0;
++ rshash_state.state = RSHASH_TRYUP;
++ hash_strength_delta = 0;
++ } else {
++ dec_hash_strength(hash_strength_delta);
++ inc_hash_strength_delta();
++ }
++ break;
++
++ case RSHASH_TRYUP:
++ if (rshash_state.lookup_window_index++ % 5 == 0)
++ rshash_state.below_count = 0;
++
++ if (get_current_benefit() < rshash_state.turn_benefit_down)
++ rshash_state.below_count++;
++ else if (get_current_benefit() > rshash_state.turn_benefit_up) {
++ rshash_state.turn_point_up = hash_strength;
++ rshash_state.turn_benefit_up = get_current_benefit();
++ }
++
++ if (rshash_state.below_count >= 3 ||
++ judge_rshash_direction() == GO_DOWN ||
++ hash_strength == HASH_STRENGTH_MAX) {
++ hash_strength = rshash_state.turn_benefit_up >
++ rshash_state.turn_benefit_down ?
++ rshash_state.turn_point_up :
++ rshash_state.turn_point_down;
++
++ rshash_state.state = RSHASH_PRE_STILL;
++ } else {
++ inc_hash_strength(hash_strength_delta);
++ inc_hash_strength_delta();
++ }
++
++ break;
++
++ case RSHASH_NEW:
++ case RSHASH_PRE_STILL:
++ rshash_state.stable_benefit = get_current_benefit();
++ rshash_state.state = RSHASH_STILL;
++ hash_strength_delta = 0;
++ break;
++ default:
++ BUG();
++ }
++
++ reset_benefit();
++
++ if (prev_hash_strength != hash_strength)
++ stable_tree_delta_hash(prev_hash_strength);
++
++ return prev_hash_strength != hash_strength;
++}
++
++/**
++ * round_update_ladder() - The main function that updates all the
++ * adjustments whenever a scan round is finished.
++ */
++static noinline void round_update_ladder(void)
++{
++ int i;
++ unsigned long dedup;
++ struct vma_slot *slot, *tmp_slot;
++
++ for (i = 0; i < SCAN_LADDER_SIZE; i++)
++ uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED;
++
++ list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) {
++
++ /* slot may be rung_rm_slot() when mm exits */
++ if (slot->snode) {
++ dedup = cal_dedup_ratio_old(slot);
++ if (dedup && dedup >= uksm_abundant_threshold)
++ vma_rung_up(slot);
++ }
++
++ slot->pages_bemerged = 0;
++ slot->pages_cowed = 0;
++
++ list_del_init(&slot->dedup_list);
++ }
++}
++
++static void uksm_del_vma_slot(struct vma_slot *slot)
++{
++ int i, j;
++ struct rmap_list_entry *entry;
++
++ if (slot->snode) {
++ /*
++ * If the slot failed while entering the rung, snode is NULL and
++ * this removal is unnecessary.
++ */
++ rung_rm_slot(slot);
++ }
++
++ if (!list_empty(&slot->dedup_list))
++ list_del(&slot->dedup_list);
++
++ if (!slot->rmap_list_pool || !slot->pool_counts) {
++ /* In case it OOMed in uksm_vma_enter() */
++ goto out;
++ }
++
++ for (i = 0; i < slot->pool_size; i++) {
++ void *addr;
++
++ if (!slot->rmap_list_pool[i])
++ continue;
++
++ addr = kmap(slot->rmap_list_pool[i]);
++ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
++ entry = (struct rmap_list_entry *)addr + j;
++ if (is_addr(entry->addr))
++ continue;
++ if (!entry->item)
++ continue;
++
++ remove_rmap_item_from_tree(entry->item);
++ free_rmap_item(entry->item);
++ slot->pool_counts[i]--;
++ }
++ BUG_ON(slot->pool_counts[i]);
++ kunmap(slot->rmap_list_pool[i]);
++ __free_page(slot->rmap_list_pool[i]);
++ }
++ kfree(slot->rmap_list_pool);
++ kfree(slot->pool_counts);
++
++out:
++ slot->rung = NULL;
++ if (slot->flags & UKSM_SLOT_IN_UKSM) {
++ BUG_ON(uksm_pages_total < slot->pages);
++ uksm_pages_total -= slot->pages;
++ }
++
++ if (slot->fully_scanned_round == fully_scanned_round)
++ scanned_virtual_pages -= slot->pages;
++ else
++ scanned_virtual_pages -= slot->pages_scanned;
++ free_vma_slot(slot);
++}
++
++
++#define SPIN_LOCK_PERIOD 32
++static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD];
++static inline void cleanup_vma_slots(void)
++{
++ struct vma_slot *slot;
++ int i;
++
++ i = 0;
++ spin_lock(&vma_slot_list_lock);
++ while (!list_empty(&vma_slot_del)) {
++ slot = list_entry(vma_slot_del.next,
++ struct vma_slot, slot_list);
++ list_del(&slot->slot_list);
++ cleanup_slots[i++] = slot;
++ if (i == SPIN_LOCK_PERIOD) {
++ spin_unlock(&vma_slot_list_lock);
++ while (--i >= 0)
++ uksm_del_vma_slot(cleanup_slots[i]);
++ i = 0;
++ spin_lock(&vma_slot_list_lock);
++ }
++ }
++ spin_unlock(&vma_slot_list_lock);
++
++ while (--i >= 0)
++ uksm_del_vma_slot(cleanup_slots[i]);
++}
++
++/*
++ * Exponential moving average formula
++ */
++static inline unsigned long ema(unsigned long curr, unsigned long last_ema)
++{
++ /*
++ * For a very high burst even the EMA cannot work well: a falsely high
++ * per-page time estimate feeds back as very high context-switch and
++ * rung-update overhead, which then leads to an even higher per-page
++ * time, so the estimate may never converge.
++ *
++ * Instead, we try to approach such a value in a binary manner.
++ */
++ if (curr > last_ema * 10)
++ return last_ema * 2;
++
++ return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100;
++}
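++/*
++ * For illustration, if EMA_ALPHA were 20: curr = 3000 and last_ema = 1000
++ * give (20 * 3000 + 80 * 1000) / 100 = 1400, while a burst of curr = 50000
++ * trips the curr > last_ema * 10 clamp and returns 2 * 1000 = 2000 instead
++ * of jumping straight to the weighted average.
++ */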
++
++/*
++ * Convert a cpu ratio (in units of 1/TIME_RATIO_SCALE, as configured by the
++ * user) to the scan time in nanoseconds granted per sleep period, based on
++ * the current uksm_sleep_jiffies.
++ */
++static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio)
++{
++ return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) /
++ (TIME_RATIO_SCALE - ratio) * ratio;
++}
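++/*
++ * For example, if uksm_sleep_jiffies corresponds to 20 ms of sleep
++ * (20000000 ns) and ratio = 2000, the scanner is granted
++ * 20000000 / (10000 - 2000) * 2000 = 5000000 ns (5 ms) of scanning per
++ * period, i.e. 5 ms of work against 20 ms of sleep, which is 20% CPU.
++ */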
++
++
++static inline unsigned long rung_real_ratio(int cpu_time_ratio)
++{
++ unsigned long ret;
++
++ BUG_ON(!cpu_time_ratio);
++
++ if (cpu_time_ratio > 0)
++ ret = cpu_time_ratio;
++ else
++ ret = (unsigned long)(-cpu_time_ratio) *
++ uksm_max_cpu_percentage / 100UL;
++
++ return ret ? ret : 1;
++}
++
++static noinline void uksm_calc_scan_pages(void)
++{
++ struct scan_rung *ladder = uksm_scan_ladder;
++ unsigned long sleep_usecs, nsecs;
++ unsigned long ratio;
++ int i;
++ unsigned long per_page;
++
++ if (uksm_ema_page_time > 100000 ||
++ (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL))
++ uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
++
++ per_page = uksm_ema_page_time;
++ BUG_ON(!per_page);
++
++ /*
++ * Every 8 eval rounds, we re-probe a uksm_sleep_jiffies value
++ * based on the saved user input.
++ */
++ if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL)
++ uksm_sleep_jiffies = uksm_sleep_saved;
++
++ /* We require that a rung scans at least one page in a period. */
++ nsecs = per_page;
++ ratio = rung_real_ratio(ladder[0].cpu_ratio);
++ if (cpu_ratio_to_nsec(ratio) < nsecs) {
++ sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio
++ / NSEC_PER_USEC;
++ uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1;
++ }
++
++ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
++ ratio = rung_real_ratio(ladder[i].cpu_ratio);
++ ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) /
++ per_page;
++ BUG_ON(!ladder[i].pages_to_scan);
++ uksm_calc_rung_step(&ladder[i], per_page, ratio);
++ }
++}
++
++/*
++ * Convert the scan time of this round (in ns) to the next expected minimum
++ * sleep time (in ms); be careful of possible overflows. ratio is taken from
++ * rung_real_ratio().
++ */
++static inline
++unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio)
++{
++ scan_time >>= 20; /* to msec level now */
++ BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE));
++
++ return (unsigned int) ((unsigned long) scan_time *
++ (TIME_RATIO_SCALE - ratio) / ratio);
++}
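++/*
++ * The >> 20 above approximates a nanosecond-to-millisecond conversion
++ * (dividing by 1048576 rather than 1000000). For example, a scan time of
++ * 8388608 ns (about 8.4 ms) shifts down to 8; with ratio = 2000 the
++ * expected sleep is 8 * (10000 - 2000) / 2000 = 32 ms.
++ */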
++
++#define __round_mask(x, y) ((__typeof__(x))((y)-1))
++#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
++
++static void uksm_vma_enter(struct vma_slot **slots, unsigned long num)
++{
++ struct scan_rung *rung;
++
++ rung = &uksm_scan_ladder[0];
++ rung_add_new_slots(rung, slots, num);
++}
++
++static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE];
++
++static void uksm_enter_all_slots(void)
++{
++ struct vma_slot *slot;
++ unsigned long index;
++ struct list_head empty_vma_list;
++ int i;
++
++ i = 0;
++ index = 0;
++ INIT_LIST_HEAD(&empty_vma_list);
++
++ spin_lock(&vma_slot_list_lock);
++ while (!list_empty(&vma_slot_new)) {
++ slot = list_entry(vma_slot_new.next,
++ struct vma_slot, slot_list);
++
++ if (!slot->vma->anon_vma) {
++ list_move(&slot->slot_list, &empty_vma_list);
++ } else if (vma_can_enter(slot->vma)) {
++ batch_slots[index++] = slot;
++ list_del_init(&slot->slot_list);
++ } else {
++ list_move(&slot->slot_list, &vma_slot_noadd);
++ }
++
++ if (++i == SPIN_LOCK_PERIOD ||
++ (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) {
++ spin_unlock(&vma_slot_list_lock);
++
++ if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) {
++ uksm_vma_enter(batch_slots, index);
++ index = 0;
++ }
++ i = 0;
++ cond_resched();
++ spin_lock(&vma_slot_list_lock);
++ }
++ }
++
++ list_splice(&empty_vma_list, &vma_slot_new);
++
++ spin_unlock(&vma_slot_list_lock);
++
++ if (index)
++ uksm_vma_enter(batch_slots, index);
++
++}
++
++static inline int rung_round_finished(struct scan_rung *rung)
++{
++ return rung->flags & UKSM_RUNG_ROUND_FINISHED;
++}
++
++static inline void judge_slot(struct vma_slot *slot)
++{
++ struct scan_rung *rung = slot->rung;
++ unsigned long dedup;
++ int deleted;
++
++ dedup = cal_dedup_ratio(slot);
++ if (vma_fully_scanned(slot) && uksm_thrash_threshold)
++ deleted = vma_rung_enter(slot, &uksm_scan_ladder[0]);
++ else if (dedup && dedup >= uksm_abundant_threshold)
++ deleted = vma_rung_up(slot);
++ else
++ deleted = vma_rung_down(slot);
++
++ slot->pages_merged = 0;
++ slot->pages_cowed = 0;
++ slot->this_sampled = 0;
++
++ if (vma_fully_scanned(slot))
++ slot->pages_scanned = 0;
++
++ slot->last_scanned = slot->pages_scanned;
++
++ /* If it was deleted above, the rung was already advanced. */
++ if (!deleted)
++ advance_current_scan(rung);
++}
++
++
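++/*
++ * A hash round is considered finished once the pages sampled since the last
++ * round exceed a quarter of all pages currently registered with uksm.
++ */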
++static inline int hash_round_finished(void)
++{
++ if (scanned_virtual_pages > (uksm_pages_total >> 2)) {
++ scanned_virtual_pages = 0;
++ if (uksm_pages_scanned)
++ fully_scanned_round++;
++
++ return 1;
++ } else {
++ return 0;
++ }
++}
++
++#define UKSM_MMSEM_BATCH 5
++#define BUSY_RETRY 100
++
++/**
++ * uksm_do_scan() - the main worker function.
++ */
++static noinline void uksm_do_scan(void)
++{
++ struct vma_slot *slot, *iter;
++ struct mm_struct *busy_mm;
++ unsigned char round_finished, all_rungs_empty;
++ int i, err, mmsem_batch;
++ unsigned long pcost;
++ long long delta_exec;
++ unsigned long vpages, max_cpu_ratio;
++ unsigned long long start_time, end_time, scan_time;
++ unsigned int expected_jiffies;
++
++ might_sleep();
++
++ vpages = 0;
++
++ start_time = task_sched_runtime(current);
++ max_cpu_ratio = 0;
++ mmsem_batch = 0;
++
++ for (i = 0; i < SCAN_LADDER_SIZE;) {
++ struct scan_rung *rung = &uksm_scan_ladder[i];
++ unsigned long ratio;
++ int busy_retry;
++
++ if (!rung->pages_to_scan) {
++ i++;
++ continue;
++ }
++
++ if (!rung->vma_root.num) {
++ rung->pages_to_scan = 0;
++ i++;
++ continue;
++ }
++
++ ratio = rung_real_ratio(rung->cpu_ratio);
++ if (ratio > max_cpu_ratio)
++ max_cpu_ratio = ratio;
++
++ busy_retry = BUSY_RETRY;
++ /*
++ * Do not consider rung_round_finished() here; just use up the
++ * rung->pages_to_scan quota.
++ */
++ while (rung->pages_to_scan && rung->vma_root.num &&
++ likely(!freezing(current))) {
++ int reset = 0;
++
++ slot = rung->current_scan;
++
++ BUG_ON(vma_fully_scanned(slot));
++
++ if (mmsem_batch)
++ err = 0;
++ else
++ err = try_down_read_slot_mmap_sem(slot);
++
++ if (err == -ENOENT) {
++rm_slot:
++ rung_rm_slot(slot);
++ continue;
++ }
++
++ busy_mm = slot->mm;
++
++ if (err == -EBUSY) {
++ /* skip other vmas on the same mm */
++ do {
++ reset = advance_current_scan(rung);
++ iter = rung->current_scan;
++ busy_retry--;
++ if (iter->vma->vm_mm != busy_mm ||
++ !busy_retry || reset)
++ break;
++ } while (1);
++
++ if (iter->vma->vm_mm != busy_mm) {
++ continue;
++ } else {
++ /* scan round finished */
++ break;
++ }
++ }
++
++ BUG_ON(!vma_can_enter(slot->vma));
++ if (uksm_test_exit(slot->vma->vm_mm)) {
++ mmsem_batch = 0;
++ up_read(&slot->vma->vm_mm->mmap_sem);
++ goto rm_slot;
++ }
++
++ if (mmsem_batch)
++ mmsem_batch--;
++ else
++ mmsem_batch = UKSM_MMSEM_BATCH;
++
++ /* OK, we have taken the mmap_sem, ready to scan */
++ scan_vma_one_page(slot);
++ rung->pages_to_scan--;
++ vpages++;
++
++ if (rung->current_offset + rung->step > slot->pages - 1
++ || vma_fully_scanned(slot)) {
++ up_read(&slot->vma->vm_mm->mmap_sem);
++ judge_slot(slot);
++ mmsem_batch = 0;
++ } else {
++ rung->current_offset += rung->step;
++ if (!mmsem_batch)
++ up_read(&slot->vma->vm_mm->mmap_sem);
++ }
++
++ busy_retry = BUSY_RETRY;
++ cond_resched();
++ }
++
++ if (mmsem_batch) {
++ up_read(&slot->vma->vm_mm->mmap_sem);
++ mmsem_batch = 0;
++ }
++
++ if (freezing(current))
++ break;
++
++ cond_resched();
++ }
++ end_time = task_sched_runtime(current);
++ delta_exec = end_time - start_time;
++
++ if (freezing(current))
++ return;
++
++ cleanup_vma_slots();
++ uksm_enter_all_slots();
++
++ round_finished = 1;
++ all_rungs_empty = 1;
++ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
++ struct scan_rung *rung = &uksm_scan_ladder[i];
++
++ if (rung->vma_root.num) {
++ all_rungs_empty = 0;
++ if (!rung_round_finished(rung))
++ round_finished = 0;
++ }
++ }
++
++ if (all_rungs_empty)
++ round_finished = 0;
++
++ if (round_finished) {
++ round_update_ladder();
++ uksm_eval_round++;
++
++ if (hash_round_finished() && rshash_adjust()) {
++ /* Reset the unstable root iff hash strength changed */
++ uksm_hash_round++;
++ root_unstable_tree = RB_ROOT;
++ free_all_tree_nodes(&unstable_tree_node_list);
++ }
++
++ /*
++ * A number of pages can hang around indefinitely on per-cpu
++ * pagevecs, raised page count preventing write_protect_page
++ * from merging them. Though it doesn't really matter much,
++ * it is puzzling to see some stuck in pages_volatile until
++ * other activity jostles them out, and they also prevented
++ * LTP's KSM test from succeeding deterministically; so drain
++ * them here (here rather than on entry to uksm_do_scan(),
++ * so we don't IPI too often when pages_to_scan is set low).
++ */
++ lru_add_drain_all();
++ }
++
++
++ if (vpages && delta_exec > 0) {
++ pcost = (unsigned long) delta_exec / vpages;
++ if (likely(uksm_ema_page_time))
++ uksm_ema_page_time = ema(pcost, uksm_ema_page_time);
++ else
++ uksm_ema_page_time = pcost;
++ }
++
++ uksm_calc_scan_pages();
++ uksm_sleep_real = uksm_sleep_jiffies;
++ /* in case of radical cpu bursts, apply the upper bound */
++ end_time = task_sched_runtime(current);
++ if (max_cpu_ratio && end_time > start_time) {
++ scan_time = end_time - start_time;
++ expected_jiffies = msecs_to_jiffies(
++ scan_time_to_sleep(scan_time, max_cpu_ratio));
++
++ if (expected_jiffies > uksm_sleep_real)
++ uksm_sleep_real = expected_jiffies;
++
++ /* We have a 1 second upper bound for responsiveness. */
++ if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC)
++ uksm_sleep_real = msecs_to_jiffies(1000);
++ }
++
++ return;
++}
++
++static int ksmd_should_run(void)
++{
++ return uksm_run & UKSM_RUN_MERGE;
++}
++
++static int uksm_scan_thread(void *nothing)
++{
++ set_freezable();
++ set_user_nice(current, 5);
++
++ while (!kthread_should_stop()) {
++ mutex_lock(&uksm_thread_mutex);
++ if (ksmd_should_run())
++ uksm_do_scan();
++ mutex_unlock(&uksm_thread_mutex);
++
++ try_to_freeze();
++
++ if (ksmd_should_run()) {
++ schedule_timeout_interruptible(uksm_sleep_real);
++ uksm_sleep_times++;
++ } else {
++ wait_event_freezable(uksm_thread_wait,
++ ksmd_should_run() || kthread_should_stop());
++ }
++ }
++ return 0;
++}
++
++void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
++{
++ struct stable_node *stable_node;
++ struct node_vma *node_vma;
++ struct rmap_item *rmap_item;
++ int search_new_forks = 0;
++ unsigned long address;
++
++ VM_BUG_ON_PAGE(!PageKsm(page), page);
++ VM_BUG_ON_PAGE(!PageLocked(page), page);
++
++ stable_node = page_stable_node(page);
++ if (!stable_node)
++ return;
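++ /*
++ * Two passes over the stable node's rmap list: the first pass
++ * (search_new_forks == 0) only visits the vma recorded in each
++ * rmap_item's slot, the second pass visits the remaining vmas that
++ * share the anon_vma, e.g. those created by fork.
++ */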
++again:
++ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
++ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
++ struct anon_vma *anon_vma = rmap_item->anon_vma;
++ struct anon_vma_chain *vmac;
++ struct vm_area_struct *vma;
++
++ cond_resched();
++ anon_vma_lock_read(anon_vma);
++ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
++ 0, ULONG_MAX) {
++ cond_resched();
++ vma = vmac->vma;
++ address = get_rmap_addr(rmap_item);
++
++ if (address < vma->vm_start ||
++ address >= vma->vm_end)
++ continue;
++
++ if ((rmap_item->slot->vma == vma) ==
++ search_new_forks)
++ continue;
++
++ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
++ continue;
++
++ if (!rwc->rmap_one(page, vma, address, rwc->arg)) {
++ anon_vma_unlock_read(anon_vma);
++ return;
++ }
++
++ if (rwc->done && rwc->done(page)) {
++ anon_vma_unlock_read(anon_vma);
++ return;
++ }
++ }
++ anon_vma_unlock_read(anon_vma);
++ }
++ }
++ if (!search_new_forks++)
++ goto again;
++}
++
++#ifdef CONFIG_MIGRATION
++/* Common ksm interface, though the implementation may be uksm-specific */
++void ksm_migrate_page(struct page *newpage, struct page *oldpage)
++{
++ struct stable_node *stable_node;
++
++ VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
++ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
++ VM_BUG_ON(newpage->mapping != oldpage->mapping);
++
++ stable_node = page_stable_node(newpage);
++ if (stable_node) {
++ VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
++ stable_node->kpfn = page_to_pfn(newpage);
++ /*
++ * newpage->mapping was set in advance; now we need smp_wmb()
++ * to make sure that the new stable_node->kpfn is visible
++ * to get_ksm_page() before it can see that oldpage->mapping
++ * has gone stale (or that PageSwapCache has been cleared).
++ */
++ smp_wmb();
++ set_page_stable_node(oldpage, NULL);
++ }
++}
++#endif /* CONFIG_MIGRATION */
++
++#ifdef CONFIG_MEMORY_HOTREMOVE
++static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn,
++ unsigned long end_pfn)
++{
++ struct rb_node *node;
++
++ for (node = rb_first(root_stable_treep); node; node = rb_next(node)) {
++ struct stable_node *stable_node;
++
++ stable_node = rb_entry(node, struct stable_node, node);
++ if (stable_node->kpfn >= start_pfn &&
++ stable_node->kpfn < end_pfn)
++ return stable_node;
++ }
++ return NULL;
++}
++
++static int uksm_memory_callback(struct notifier_block *self,
++ unsigned long action, void *arg)
++{
++ struct memory_notify *mn = arg;
++ struct stable_node *stable_node;
++
++ switch (action) {
++ case MEM_GOING_OFFLINE:
++ /*
++ * Keep it very simple for now: just lock out ksmd and
++ * MADV_UNMERGEABLE while any memory is going offline.
++ * mutex_lock_nested() is necessary because lockdep was alarmed
++ * that here we take uksm_thread_mutex inside notifier chain
++ * mutex, and later take notifier chain mutex inside
++ * uksm_thread_mutex to unlock it. But that's safe because both
++ * are inside mem_hotplug_mutex.
++ */
++ mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING);
++ break;
++
++ case MEM_OFFLINE:
++ /*
++ * Most of the work is done by page migration; but there might
++ * be a few stable_nodes left over, still pointing to struct
++ * pages which have been offlined: prune those from the tree.
++ */
++ while ((stable_node = uksm_check_stable_tree(mn->start_pfn,
++ mn->start_pfn + mn->nr_pages)) != NULL)
++ remove_node_from_stable_tree(stable_node, 1, 1);
++ /* fallthrough */
++
++ case MEM_CANCEL_OFFLINE:
++ mutex_unlock(&uksm_thread_mutex);
++ break;
++ }
++ return NOTIFY_OK;
++}
++#endif /* CONFIG_MEMORY_HOTREMOVE */
++
++#ifdef CONFIG_SYSFS
++/*
++ * This all compiles without CONFIG_SYSFS, but is a waste of space.
++ */
++
++#define UKSM_ATTR_RO(_name) \
++ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
++#define UKSM_ATTR(_name) \
++ static struct kobj_attribute _name##_attr = \
++ __ATTR(_name, 0644, _name##_show, _name##_store)
++
++static ssize_t max_cpu_percentage_show(struct kobject *kobj,
++ struct kobj_attribute *attr, char *buf)
++{
++ return sprintf(buf, "%u\n", uksm_max_cpu_percentage);
++}
++
++static ssize_t max_cpu_percentage_store(struct kobject *kobj,
++ struct kobj_attribute *attr,
++ const char *buf, size_t count)
++{
++ unsigned long max_cpu_percentage;
++ int err;
++
++ err = kstrtoul(buf, 10, &max_cpu_percentage);
++ if (err || max_cpu_percentage > 100)
++ return -EINVAL;
++
++ if (max_cpu_percentage == 100)
++ max_cpu_percentage = 99;
++ else if (max_cpu_percentage < 10)
++ max_cpu_percentage = 10;
++
++ uksm_max_cpu_percentage = max_cpu_percentage;
++
++ return count;
++}
++UKSM_ATTR(max_cpu_percentage);
++
++static ssize_t sleep_millisecs_show(struct kobject *kobj,
++ struct kobj_attribute *attr, char *buf)
++{
++ return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies));
++}
++
++static ssize_t sleep_millisecs_store(struct kobject *kobj,
++ struct kobj_attribute *attr,
++ const char *buf, size_t count)
++{
++ unsigned long msecs;
++ int err;
++
++ err = kstrtoul(buf, 10, &msecs);
++ if (err || msecs > MSEC_PER_SEC)
++ return -EINVAL;
++
++ uksm_sleep_jiffies = msecs_to_jiffies(msecs);
++ uksm_sleep_saved = uksm_sleep_jiffies;
++
++ return count;
++}
++UKSM_ATTR(sleep_millisecs);
++
++
++static ssize_t cpu_governor_show(struct kobject *kobj,
++ struct kobj_attribute *attr, char *buf)
++{
++ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
++ int i;
++
++ buf[0] = '\0';
++ for (i = 0; i < n ; i++) {
++ if (uksm_cpu_governor == i)
++ strcat(buf, "[");
++
++ strcat(buf, uksm_cpu_governor_str[i]);
++
++ if (uksm_cpu_governor == i)
++ strcat(buf, "]");
++
++ strcat(buf, " ");
++ }
++ strcat(buf, "\n");
++
++ return strlen(buf);
++}
++
++static inline void init_performance_values(void)
++{
++ int i;
++ struct scan_rung *rung;
++ struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor;
++
++
++ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
++ rung = uksm_scan_ladder + i;
++ rung->cpu_ratio = preset->cpu_ratio[i];
++ rung->cover_msecs = preset->cover_msecs[i];
++ }
++
++ uksm_max_cpu_percentage = preset->max_cpu;
++}
++
++static ssize_t cpu_governor_store(struct kobject *kobj,
++ struct kobj_attribute *attr,
++ const char *buf, size_t count)
++{
++ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
++
++ for (n--; n >= 0 ; n--) {
++ if (!strncmp(buf, uksm_cpu_governor_str[n],
++ strlen(uksm_cpu_governor_str[n])))
++ break;
++ }
++
++ if (n < 0)
++ return -EINVAL;
++ else
++ uksm_cpu_governor = n;
++
++ init_performance_values();
++
++ return count;
++}
++UKSM_ATTR(cpu_governor);
++
++static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
++ char *buf)
++{
++ return sprintf(buf, "%u\n", uksm_run);
++}
++
++static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
++ const char *buf, size_t count)
++{
++ int err;
++ unsigned long flags;
++
++ err = kstrtoul(buf, 10, &flags);
++ if (err || flags > UINT_MAX)
++ return -EINVAL;
++ if (flags > UKSM_RUN_MERGE)
++ return -EINVAL;
++
++ mutex_lock(&uksm_thread_mutex);
++ if (uksm_run != flags)
++ uksm_run = flags;
++ mutex_unlock(&uksm_thread_mutex);
++
++ if (flags & UKSM_RUN_MERGE)
++ wake_up_interruptible(&uksm_thread_wait);
++
++ return count;
++}
++UKSM_ATTR(run);
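++/*
++ * Typical usage, assuming UKSM_RUN_MERGE is 1 as in KSM's run interface:
++ * echo 1 > /sys/kernel/mm/uksm/run (start scanning)
++ * echo 0 > /sys/kernel/mm/uksm/run (stop scanning)
++ */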
++
++static ssize_t abundant_threshold_show(struct kobject *kobj,
++ struct kobj_attribute *attr, char *buf)
++{
++ return sprintf(buf, "%u\n", uksm_abundant_threshold);
++}
++
++static ssize_t abundant_threshold_store(struct kobject *kobj,
++ struct kobj_attribute *attr,
++ const char *buf, size_t count)
++{
++ int err;
++ unsigned long flags;
++
++ err = kstrtoul(buf, 10, &flags);
++ if (err || flags > 99)
++ return -EINVAL;
++
++ uksm_abundant_threshold = flags;
++
++ return count;
++}
++UKSM_ATTR(abundant_threshold);
++
++static ssize_t thrash_threshold_show(struct kobject *kobj,
++ struct kobj_attribute *attr, char *buf)
++{
++ return sprintf(buf, "%u\n", uksm_thrash_threshold);
++}
++
++static ssize_t thrash_threshold_store(struct kobject *kobj,
++ struct kobj_attribute *attr,
++ const char *buf, size_t count)
++{
++ int err;
++ unsigned long flags;
++
++ err = kstrtoul(buf, 10, &flags);
++ if (err || flags > 99)
++ return -EINVAL;
++
++ uksm_thrash_threshold = flags;
++
++ return count;
++}
++UKSM_ATTR(thrash_threshold);
++
++static ssize_t cpu_ratios_show(struct kobject *kobj,
++ struct kobj_attribute *attr, char *buf)
++{
++ int i, size;
++ struct scan_rung *rung;
++ char *p = buf;
++
++ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
++ rung = &uksm_scan_ladder[i];
++
++ if (rung->cpu_ratio > 0)
++ size = sprintf(p, "%d ", rung->cpu_ratio);
++ else
++ size = sprintf(p, "MAX/%d ",
++ TIME_RATIO_SCALE / -rung->cpu_ratio);
++
++ p += size;
++ }
++
++ *p++ = '\n';
++ *p = '\0';
++
++ return p - buf;
++}
++
++static ssize_t cpu_ratios_store(struct kobject *kobj,
++ struct kobj_attribute *attr,
++ const char *buf, size_t count)
++{
++ int i, cpuratios[SCAN_LADDER_SIZE], err;
++ unsigned long value;
++ struct scan_rung *rung;
++ char *p, *end = NULL, *str;
++ ssize_t ret = count;
++
++ /* allocate count + 2 (as in eval_intervals_store) so the copy is NUL terminated */
++ str = p = kzalloc(count + 2, GFP_KERNEL);
++ if (!p)
++ return -ENOMEM;
++
++ memcpy(p, buf, count);
++
++ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
++ if (i != SCAN_LADDER_SIZE - 1) {
++ end = strchr(p, ' ');
++ if (!end) {
++ ret = -EINVAL;
++ goto out;
++ }
++
++ *end = '\0';
++ }
++
++ if (strstr(p, "MAX/")) {
++ p = strchr(p, '/') + 1;
++ err = kstrtoul(p, 10, &value);
++ if (err || value > TIME_RATIO_SCALE || !value) {
++ ret = -EINVAL;
++ goto out;
++ }
++
++ cpuratios[i] = -(int) (TIME_RATIO_SCALE / value);
++ } else {
++ err = kstrtoul(p, 10, &value);
++ if (err || value > TIME_RATIO_SCALE || !value) {
++ ret = -EINVAL;
++ goto out;
++ }
++
++ cpuratios[i] = value;
++ }
++
++ p = end + 1;
++ }
++
++ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
++ rung = &uksm_scan_ladder[i];
++
++ rung->cpu_ratio = cpuratios[i];
++ }
++
++out:
++ kfree(str); /* free the original allocation on every path */
++ return ret;
++}
++UKSM_ATTR(cpu_ratios);
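++/*
++ * Example write, assuming the default four scan rungs:
++ * echo "50 500 2000 MAX/4" > /sys/kernel/mm/uksm/cpu_ratios
++ * A bare number is a share out of TIME_RATIO_SCALE (10000), while "MAX/n"
++ * is stored as a negative ratio meaning 1/n of the maximum allowed CPU
++ * usage as resolved by rung_real_ratio().
++ */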
++
++static ssize_t eval_intervals_show(struct kobject *kobj,
++ struct kobj_attribute *attr, char *buf)
++{
++ int i, size;
++ struct scan_rung *rung;
++ char *p = buf;
++
++ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
++ rung = &uksm_scan_ladder[i];
++ size = sprintf(p, "%u ", rung->cover_msecs);
++ p += size;
++ }
++
++ *p++ = '\n';
++ *p = '\0';
++
++ return p - buf;
++}
++
++static ssize_t eval_intervals_store(struct kobject *kobj,
++ struct kobj_attribute *attr,
++ const char *buf, size_t count)
++{
++ int i, err;
++ unsigned long values[SCAN_LADDER_SIZE];
++ struct scan_rung *rung;
++ char *p, *end = NULL, *str;
++ ssize_t ret = count;
++
++ str = p = kzalloc(count + 2, GFP_KERNEL);
++ if (!p)
++ return -ENOMEM;
++
++ memcpy(p, buf, count);
++
++ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
++ if (i != SCAN_LADDER_SIZE - 1) {
++ end = strchr(p, ' ');
++ if (!end) {
++ ret = -EINVAL;
++ goto out;
++ }
++
++ *end = '\0';
++ }
++
++ err = kstrtoul(p, 10, &values[i]);
++ if (err) {
++ ret = -EINVAL;
++ goto out;
++ }
++
++ p = end + 1;
++ }
++
++ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
++ rung = &uksm_scan_ladder[i];
++
++ rung->cover_msecs = values[i];
++ }
++
++out:
++ kfree(str); /* p has been advanced past the parsed fields; free the original buffer */
++ return ret;
++}
++UKSM_ATTR(eval_intervals);
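++/*
++ * Example write (assuming four rungs), one interval in milliseconds per
++ * rung, space separated:
++ * echo "1000 500 200 100" > /sys/kernel/mm/uksm/eval_intervals
++ */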
++
++static ssize_t ema_per_page_time_show(struct kobject *kobj,
++ struct kobj_attribute *attr, char *buf)
++{
++ return sprintf(buf, "%lu\n", uksm_ema_page_time);
++}
++UKSM_ATTR_RO(ema_per_page_time);
++
++static ssize_t pages_shared_show(struct kobject *kobj,
++ struct kobj_attribute *attr, char *buf)
++{
++ return sprintf(buf, "%lu\n", uksm_pages_shared);
++}
++UKSM_ATTR_RO(pages_shared);
++
++static ssize_t pages_sharing_show(struct kobject *kobj,
++ struct kobj_attribute *attr, char *buf)
++{
++ return sprintf(buf, "%lu\n", uksm_pages_sharing);
++}
++UKSM_ATTR_RO(pages_sharing);
++
++static ssize_t pages_unshared_show(struct kobject *kobj,
++ struct kobj_attribute *attr, char *buf)
++{
++ return sprintf(buf, "%lu\n", uksm_pages_unshared);
++}
++UKSM_ATTR_RO(pages_unshared);
++
++static ssize_t full_scans_show(struct kobject *kobj,
++ struct kobj_attribute *attr, char *buf)
++{
++ return sprintf(buf, "%llu\n", fully_scanned_round);
++}
++UKSM_ATTR_RO(full_scans);
++
++static ssize_t pages_scanned_show(struct kobject *kobj,
++ struct kobj_attribute *attr, char *buf)
++{
++ unsigned long base = 0;
++ u64 delta, ret;
++
++ if (pages_scanned_stored) {
++ base = pages_scanned_base;
++ ret = pages_scanned_stored;
++ delta = uksm_pages_scanned >> base;
++ if (CAN_OVERFLOW_U64(ret, delta)) {
++ ret >>= 1;
++ delta >>= 1;
++ base++;
++ ret += delta;
++ }
++ } else {
++ ret = uksm_pages_scanned;
++ }
++
++ while (ret > ULONG_MAX) {
++ ret >>= 1;
++ base++;
++ }
++
++ if (base)
++ return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base);
++ else
++ return sprintf(buf, "%lu\n", (unsigned long)ret);
++}
++UKSM_ATTR_RO(pages_scanned);
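++/*
++ * Example of the scaled output above: once the counter has been folded to
++ * avoid overflow, e.g. pages_scanned_stored = 3000000 with
++ * pages_scanned_base = 3, the file reports "3000000 * 2^3", i.e. roughly
++ * 24 million pages scanned.
++ */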
++
++static ssize_t hash_strength_show(struct kobject *kobj,
++ struct kobj_attribute *attr, char *buf)
++{
++ return sprintf(buf, "%lu\n", hash_strength);
++}
++UKSM_ATTR_RO(hash_strength);
++
++static ssize_t sleep_times_show(struct kobject *kobj,
++ struct kobj_attribute *attr, char *buf)
++{
++ return sprintf(buf, "%llu\n", uksm_sleep_times);
++}
++UKSM_ATTR_RO(sleep_times);
++
++
++static struct attribute *uksm_attrs[] = {
++ &max_cpu_percentage_attr.attr,
++ &sleep_millisecs_attr.attr,
++ &cpu_governor_attr.attr,
++ &run_attr.attr,
++ &ema_per_page_time_attr.attr,
++ &pages_shared_attr.attr,
++ &pages_sharing_attr.attr,
++ &pages_unshared_attr.attr,
++ &full_scans_attr.attr,
++ &pages_scanned_attr.attr,
++ &hash_strength_attr.attr,
++ &sleep_times_attr.attr,
++ &thrash_threshold_attr.attr,
++ &abundant_threshold_attr.attr,
++ &cpu_ratios_attr.attr,
++ &eval_intervals_attr.attr,
++ NULL,
++};
++
++static struct attribute_group uksm_attr_group = {
++ .attrs = uksm_attrs,
++ .name = "uksm",
++};
++#endif /* CONFIG_SYSFS */
++
++static inline void init_scan_ladder(void)
++{
++ int i;
++ struct scan_rung *rung;
++
++ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
++ rung = uksm_scan_ladder + i;
++ slot_tree_init_root(&rung->vma_root);
++ }
++
++ init_performance_values();
++ uksm_calc_scan_pages();
++}
++
++static inline int cal_positive_negative_costs(void)
++{
++ struct page *p1, *p2;
++ unsigned char *addr1, *addr2;
++ unsigned long i, time_start, hash_cost;
++ unsigned long loopnum = 0;
++
++ /* IMPORTANT: volatile is needed to prevent over-optimization by gcc. */
++ volatile u32 hash;
++ volatile int ret;
++
++ p1 = alloc_page(GFP_KERNEL);
++ if (!p1)
++ return -ENOMEM;
++
++ p2 = alloc_page(GFP_KERNEL);
++ if (!p2) {
++ __free_page(p1);
++ return -ENOMEM;
++ }
++
++ addr1 = kmap_atomic(p1);
++ addr2 = kmap_atomic(p2);
++ memset(addr1, prandom_u32(), PAGE_SIZE);
++ memcpy(addr2, addr1, PAGE_SIZE);
++
++ /* make sure that the two pages differ in the last byte */
++ addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1];
++ kunmap_atomic(addr2);
++ kunmap_atomic(addr1);
++
++ time_start = jiffies;
++ while (jiffies - time_start < 100) {
++ for (i = 0; i < 100; i++)
++ hash = page_hash(p1, HASH_STRENGTH_FULL, 0);
++ loopnum += 100;
++ }
++ hash_cost = (jiffies - time_start);
++
++ time_start = jiffies;
++ for (i = 0; i < loopnum; i++)
++ ret = pages_identical_with_cost(p1, p2);
++ memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start);
++ memcmp_cost /= hash_cost;
++ pr_info("UKSM: relative memcmp_cost = %lu "
++ "hash=%u cmp_ret=%d.\n",
++ memcmp_cost, hash, ret);
++
++ __free_page(p1);
++ __free_page(p2);
++ return 0;
++}
++
++static int init_zeropage_hash_table(void)
++{
++ struct page *page;
++ char *addr;
++ int i;
++
++ page = alloc_page(GFP_KERNEL);
++ if (!page)
++ return -ENOMEM;
++
++ addr = kmap_atomic(page);
++ memset(addr, 0, PAGE_SIZE);
++ kunmap_atomic(addr);
++
++ zero_hash_table = kmalloc_array(HASH_STRENGTH_MAX, sizeof(u32),
++ GFP_KERNEL);
++ if (!zero_hash_table) {
++ __free_page(page);
++ return -ENOMEM;
++ }
++
++ for (i = 0; i < HASH_STRENGTH_MAX; i++)
++ zero_hash_table[i] = page_hash(page, i, 0);
++
++ __free_page(page);
++
++ return 0;
++}
++
++static inline int init_random_sampling(void)
++{
++ unsigned long i;
++
++ random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL);
++ if (!random_nums)
++ return -ENOMEM;
++
++ for (i = 0; i < HASH_STRENGTH_FULL; i++)
++ random_nums[i] = i;
++
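++ /* Fisher-Yates shuffle: swap each slot with a random later (or same) slot */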
++ for (i = 0; i < HASH_STRENGTH_FULL; i++) {
++ unsigned long rand_range, swap_index, tmp;
++
++ rand_range = HASH_STRENGTH_FULL - i;
++ swap_index = i + prandom_u32() % rand_range;
++ tmp = random_nums[i];
++ random_nums[i] = random_nums[swap_index];
++ random_nums[swap_index] = tmp;
++ }
++
++ rshash_state.state = RSHASH_NEW;
++ rshash_state.below_count = 0;
++ rshash_state.lookup_window_index = 0;
++
++ return cal_positive_negative_costs();
++}
++
++static int __init uksm_slab_init(void)
++{
++ rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0);
++ if (!rmap_item_cache)
++ goto out;
++
++ stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0);
++ if (!stable_node_cache)
++ goto out_free1;
++
++ node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0);
++ if (!node_vma_cache)
++ goto out_free2;
++
++ vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0);
++ if (!vma_slot_cache)
++ goto out_free3;
++
++ tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0);
++ if (!tree_node_cache)
++ goto out_free4;
++
++ return 0;
++
++out_free4:
++ kmem_cache_destroy(vma_slot_cache);
++out_free3:
++ kmem_cache_destroy(node_vma_cache);
++out_free2:
++ kmem_cache_destroy(stable_node_cache);
++out_free1:
++ kmem_cache_destroy(rmap_item_cache);
++out:
++ return -ENOMEM;
++}
++
++static void __init uksm_slab_free(void)
++{
++ kmem_cache_destroy(stable_node_cache);
++ kmem_cache_destroy(rmap_item_cache);
++ kmem_cache_destroy(node_vma_cache);
++ kmem_cache_destroy(vma_slot_cache);
++ kmem_cache_destroy(tree_node_cache);
++}
++
++/* Common interface to ksm, but it behaves differently here. */
++int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
++ unsigned long end, int advice, unsigned long *vm_flags)
++{
++ int err;
++
++ switch (advice) {
++ case MADV_MERGEABLE:
++ return 0; /* just ignore the advice */
++
++ case MADV_UNMERGEABLE:
++ if (!(*vm_flags & VM_MERGEABLE) || !uksm_flags_can_scan(*vm_flags))
++ return 0; /* just ignore the advice */
++
++ if (vma->anon_vma) {
++ err = unmerge_uksm_pages(vma, start, end);
++ if (err)
++ return err;
++ }
++
++ uksm_remove_vma(vma);
++ *vm_flags &= ~VM_MERGEABLE;
++ break;
++ }
++
++ return 0;
++}
++
++/* Common interface to ksm, actually the same. */
++struct page *ksm_might_need_to_copy(struct page *page,
++ struct vm_area_struct *vma, unsigned long address)
++{
++ struct anon_vma *anon_vma = page_anon_vma(page);
++ struct page *new_page;
++
++ if (PageKsm(page)) {
++ if (page_stable_node(page))
++ return page; /* no need to copy it */
++ } else if (!anon_vma) {
++ return page; /* no need to copy it */
++ } else if (anon_vma->root == vma->anon_vma->root &&
++ page->index == linear_page_index(vma, address)) {
++ return page; /* still no need to copy it */
++ }
++ if (!PageUptodate(page))
++ return page; /* let do_swap_page report the error */
++
++ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
++ if (new_page) {
++ copy_user_highpage(new_page, page, address, vma);
++
++ SetPageDirty(new_page);
++ __SetPageUptodate(new_page);
++ __SetPageLocked(new_page);
++ }
++
++ return new_page;
++}
++
++/* Copied from mm/ksm.c; required since kernel 5.1 */
++bool reuse_ksm_page(struct page *page,
++ struct vm_area_struct *vma,
++ unsigned long address)
++{
++#ifdef CONFIG_DEBUG_VM
++ if (WARN_ON(is_zero_pfn(page_to_pfn(page))) ||
++ WARN_ON(!page_mapped(page)) ||
++ WARN_ON(!PageLocked(page))) {
++ dump_page(page, "reuse_ksm_page");
++ return false;
++ }
++#endif
++
++ if (PageSwapCache(page) || !page_stable_node(page))
++ return false;
++ /* Prohibit parallel get_ksm_page() */
++ if (!page_ref_freeze(page, 1))
++ return false;
++
++ page_move_anon_rmap(page, vma);
++ page->index = linear_page_index(vma, address);
++ page_ref_unfreeze(page, 1);
++
++ return true;
++}
++
++static int __init uksm_init(void)
++{
++ struct task_struct *uksm_thread;
++ int err;
++
++ uksm_sleep_jiffies = msecs_to_jiffies(100);
++ uksm_sleep_saved = uksm_sleep_jiffies;
++
++ slot_tree_init();
++ init_scan_ladder();
++
++
++ err = init_random_sampling();
++ if (err)
++ goto out_free2;
++
++ err = uksm_slab_init();
++ if (err)
++ goto out_free1;
++
++ err = init_zeropage_hash_table();
++ if (err)
++ goto out_free0;
++
++ uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd");
++ if (IS_ERR(uksm_thread)) {
++ pr_err("uksm: creating kthread failed\n");
++ err = PTR_ERR(uksm_thread);
++ goto out_free;
++ }
++
++#ifdef CONFIG_SYSFS
++ err = sysfs_create_group(mm_kobj, &uksm_attr_group);
++ if (err) {
++ pr_err("uksm: register sysfs failed\n");
++ kthread_stop(uksm_thread);
++ goto out_free;
++ }
++#else
++ uksm_run = UKSM_RUN_MERGE; /* without sysfs there is no way for the user to start it, so run by default */
++
++#endif /* CONFIG_SYSFS */
++
++#ifdef CONFIG_MEMORY_HOTREMOVE
++ /*
++ * Choose a high priority since the callback takes uksm_thread_mutex:
++ * later callbacks could only be taking locks which nest within that.
++ */
++ hotplug_memory_notifier(uksm_memory_callback, 100);
++#endif
++ return 0;
++
++out_free:
++ kfree(zero_hash_table);
++out_free0:
++ uksm_slab_free();
++out_free1:
++ kfree(random_nums);
++out_free2:
++ kfree(uksm_scan_ladder);
++ return err;
++}
++
++#ifdef MODULE
++subsys_initcall(uksm_init);
++#else
++late_initcall(uksm_init);
++#endif
++
+diff --git a/mm/vmstat.c b/mm/vmstat.c
+index a8222041bd44..7058e8322cbd 100644
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -1168,6 +1168,9 @@ const char * const vmstat_text[] = {
+ "nr_written",
+ "nr_kernel_misc_reclaimable",
+
++#ifdef CONFIG_UKSM
++ "nr_uksm_zero_pages",
++#endif
+ /* enum writeback_stat_item counters */
+ "nr_dirty_threshold",
+ "nr_dirty_background_threshold",
diff --git a/README.md b/README.md
index 200d40975..0eb683747 100644
--- a/README.md
+++ b/README.md
@@ -22,26 +22,21 @@
### Features
- Built on top of vanilla OpenWrt 21.02; default admin address is 192.168.1.1
-- Supports both SFE and Software Offload (enable one of the two; SFE is enabled by default)
+- Software Offload is enabled by default
- The built-in upgrade function works, and the physical Reset button works
- Several plugins come pre-configured (including, but not limited to, nested DNS; note that nested DNS was dropped on June 29 in favour of dnsfilter for ad filtering and dnsproxy for DNS splitting, with port 5335 for overseas and port 6050 for domestic queries)
-- Official Release builds will support hassle-free opkg kmod installs
+- Hassle-free opkg kmod installs
- R2S cores run at 1.6 GHz (LAN and WAN are swapped); R4S cores run at 2.2/1.8 GHz (a 5V/4A power supply is recommended; most hangs are caused by a poor power supply, and you can also cap the maximum frequency with the bundled app 🍆)
-- Built with O3
-- Plugins include: SSRP, PassWall, OpenClash, AdguardHome, WeChat push, NetEase Cloud Music unlock, SQM, SmartDNS, ChinaDNS, Wake-on-LAN, DDNS, Xunlei KuaiNiao, UPNP, FullCone (enabled in the firewall by default), flow offloading, SFE flow offloading, IRQ tuning, JD daily check-in, Zerotier, FRPC, FRPS, wireless printing, traffic monitoring, filter toolbox, R2S-OLED
+- Built with O3 and tuned CFLAGS
+- Plugins include: SSRP, PassWall, OpenClash, AdguardHome, WeChat push, NetEase Cloud Music unlock, SQM, DNSProxy, Wake-on-LAN, DDNS, Xunlei KuaiNiao, UPNP, FullCone (enabled in the firewall by default), flow offloading, IRQ tuning, JD daily check-in, Zerotier, FRPC, FRPS, wireless printing, traffic monitoring, filter toolbox, R2S-OLED
- The ss protocol gets AES hardware acceleration on armv8 (please use AEAD ciphers only)
+- UKSM, BBRv2 and the CacULE Scheduler are integrated and enabled by default
- If you run into any problem, first try to ssh into the box, type fuck and press Enter, then check whether the problem is gone after the device reboots
### Download
- Pick the firmware matching your device and [download](https://github.com/QiuSimons/R2S-R4S-OpenWrt/releases) it
-### Screenshots
-
-| Components | Flow offloading |
-| :----------------------------------------------------------: | :----------------------------------------------------------: |
-|  |  |
-
### Credits
| [CTCGFW](https://github.com/immortalwrt) | [coolsnowwolf](https://github.com/coolsnowwolf) | [Lienol](https://github.com/Lienol) |
diff --git a/SCRIPTS/02_prepare_package.sh b/SCRIPTS/02_prepare_package.sh
index c08657d52..934c8bae3 100644
--- a/SCRIPTS/02_prepare_package.sh
+++ b/SCRIPTS/02_prepare_package.sh
@@ -17,7 +17,7 @@ rm -rf ./scripts/download.pl
rm -rf ./include/download.mk
wget -P scripts/ https://github.com/immortalwrt/immortalwrt/raw/openwrt-21.02/scripts/download.pl
wget -P include/ https://github.com/immortalwrt/immortalwrt/raw/openwrt-21.02/include/download.mk
-wget -P include/ https://github.com/immortalwrt/immortalwrt/raw/openwrt-21.02/include/package-immortalwrt.mk
+#wget -P include/ https://github.com/immortalwrt/immortalwrt/raw/openwrt-21.02/include/package-immortalwrt.mk
sed -i '/unshift/d' scripts/download.pl
sed -i '/mirror02/d' scripts/download.pl
echo "net.netfilter.nf_conntrack_helper = 1" >> ./package/kernel/linux/files/sysctl-nf-conntrack.conf
@@ -155,13 +155,13 @@ git clone -b luci --depth 1 https://github.com/pexcn/openwrt-chinadns-ng.git pac
svn co https://github.com/xiaorouji/openwrt-passwall/trunk/chinadns-ng package/new/chinadns-ng
# Memory compression
#wget -O- https://patch-diff.githubusercontent.com/raw/openwrt/openwrt/pull/2840.patch | patch -p1
-wget -O- https://github.com/NoTengoBattery/openwrt/commit/40f1d5.patch | patch -p1
-wget -O- https://github.com/NoTengoBattery/openwrt/commit/a83a0b.patch | patch -p1
-wget -O- https://github.com/NoTengoBattery/openwrt/commit/6d5fb4.patch | patch -p1
-mkdir ./package/new
-cp -rf ../NoTengoBattery/feeds/luci/applications/luci-app-compressed-memory ./package/new/luci-app-compressed-memory
-sed -i 's,include ../..,include $(TOPDIR)/feeds/luci,g' ./package/new/luci-app-compressed-memory/Makefile
-cp -rf ../NoTengoBattery/package/system/compressed-memory ./package/system/compressed-memory
+#wget -O- https://github.com/NoTengoBattery/openwrt/commit/40f1d5.patch | patch -p1
+#wget -O- https://github.com/NoTengoBattery/openwrt/commit/a83a0b.patch | patch -p1
+#wget -O- https://github.com/NoTengoBattery/openwrt/commit/6d5fb4.patch | patch -p1
+#mkdir ./package/new
+#cp -rf ../NoTengoBattery/feeds/luci/applications/luci-app-compressed-memory ./package/new/luci-app-compressed-memory
+#sed -i 's,include ../..,include $(TOPDIR)/feeds/luci,g' ./package/new/luci-app-compressed-memory/Makefile
+#cp -rf ../NoTengoBattery/package/system/compressed-memory ./package/system/compressed-memory
# CPU control
svn co https://github.com/immortalwrt/luci/trunk/applications/luci-app-cpufreq feeds/luci/applications/luci-app-cpufreq
ln -sf ../../../feeds/luci/applications/luci-app-cpufreq ./package/feeds/luci/luci-app-cpufreq
@@ -176,11 +176,11 @@ ln -sf ../../../feeds/packages/utils/cpulimit ./package/feeds/packages/cpulimit
#rm -rf ./feeds/luci/applications/luci-app-ddns
#svn co https://github.com/coolsnowwolf/lede/trunk/package/lean/ddns-scripts_aliyun package/lean/ddns-scripts_aliyun
#svn co https://github.com/coolsnowwolf/lede/trunk/package/lean/ddns-scripts_dnspod package/lean/ddns-scripts_dnspod
+#svn co https://github.com/openwrt/packages/branches/openwrt-18.06/net/ddns-scripts feeds/packages/net/ddns-scripts
+#svn co https://github.com/openwrt/luci/branches/openwrt-18.06/applications/luci-app-ddns feeds/luci/applications/luci-app-ddns
git clone --depth 1 https://github.com/small-5/ddns-scripts-dnspod package/lean/ddns-scripts_dnspod
git clone --depth 1 https://github.com/small-5/ddns-scripts-aliyun package/lean/ddns-scripts_aliyun
-svn co https://github.com/openwrt/packages/branches/openwrt-18.06/net/ddns-scripts feeds/packages/net/ddns-scripts
-svn co https://github.com/openwrt/luci/branches/openwrt-18.06/applications/luci-app-ddns feeds/luci/applications/luci-app-ddns
-svn co https://github.com/QiuSimons/OpenWrt_luci-app/trunk/others/luci-app-tencentddns package/lean/luci-app-tencentddns
+svn co https://github.com/QiuSimons/OpenWrt_luci-app/trunk/luci-app-tencentddns package/lean/luci-app-tencentddns
svn co https://github.com/kenzok8/openwrt-packages/trunk/luci-app-aliddns feeds/luci/applications/luci-app-aliddns
ln -sf ../../../feeds/luci/applications/luci-app-aliddns ./package/feeds/luci/luci-app-aliddns
# Docker 容器(会导致 OpenWrt 出现 UDP 转发问题,慎用)
@@ -208,13 +208,13 @@ svn co https://github.com/coolsnowwolf/lede/trunk/package/lean/luci-app-ipsec-vp
svn co https://github.com/coolsnowwolf/lede/trunk/package/lean/ipv6-helper package/lean/ipv6-helper
# JD daily check-in, by Jerrykuku
git clone --depth 1 https://github.com/jerrykuku/node-request.git package/new/node-request
-svn co -r131 https://github.com/jerrykuku/luci-app-jd-dailybonus/trunk package/new/luci-app-jd-dailybonus
-pushd package/new/luci-app-jd-dailybonus
-sed -i 's,wget-ssl,wget,g' root/usr/share/jd-dailybonus/newapp.sh luasrc/controller/jd-dailybonus.lua
-sed -i 's,* sh,*,g' root/usr/share/jd-dailybonus/newapp.sh
-popd
-rm -rf ./package/new/luci-app-jd-dailybonus/root/usr/share/jd-dailybonus/JD_DailyBonus.js
-wget -P package/new/luci-app-jd-dailybonus/root/usr/share/jd-dailybonus/ https://github.com/NobyDa/Script/raw/master/JD-DailyBonus/JD_DailyBonus.js
+git clone --depth 1 https://github.com/jerrykuku/luci-app-jd-dailybonus.git package/new/luci-app-jd-dailybonus
+#pushd package/new/luci-app-jd-dailybonus
+#sed -i 's,wget-ssl,wget,g' root/usr/share/jd-dailybonus/newapp.sh luasrc/controller/jd-dailybonus.lua
+#sed -i 's,* sh,*,g' root/usr/share/jd-dailybonus/newapp.sh
+#popd
+#rm -rf ./package/new/luci-app-jd-dailybonus/root/usr/share/jd-dailybonus/JD_DailyBonus.js
+#wget -P package/new/luci-app-jd-dailybonus/root/usr/share/jd-dailybonus/ https://github.com/NobyDa/Script/raw/master/JD-DailyBonus/JD_DailyBonus.js
# Roll back UPnP (miniupnpd)
#rm -rf ./feeds/packages/net/miniupnpd
#svn co https://github.com/coolsnowwolf/packages/trunk/net/miniupnpd feeds/packages/net/miniupnpd
@@ -284,7 +284,6 @@ rm -rf ./feeds/luci/applications/luci-app-smartdns
svn co https://github.com/immortalwrt/luci/branches/openwrt-18.06/applications/luci-app-smartdns feeds/luci/applications/luci-app-smartdns
# ShadowsocksR Plus+ dependencies
rm -rf ./feeds/packages/net/kcptun
-rm -rf ./feeds/packages/net/proxychains-ng
rm -rf ./feeds/packages/net/shadowsocks-libev
rm -rf ./feeds/packages/net/xray-core
#svn co https://github.com/coolsnowwolf/lede/trunk/package/lean/shadowsocksr-libev package/lean/shadowsocksr-libev
@@ -295,7 +294,6 @@ svn co https://github.com/coolsnowwolf/lede/trunk/package/lean/srelay package/le
svn co https://github.com/coolsnowwolf/lede/trunk/package/lean/microsocks package/lean/microsocks
svn co https://github.com/coolsnowwolf/lede/trunk/package/lean/dns2socks package/lean/dns2socks
svn co https://github.com/coolsnowwolf/lede/trunk/package/lean/redsocks2 package/lean/redsocks2
-svn co https://github.com/coolsnowwolf/lede/trunk/package/lean/proxychains-ng package/lean/proxychains-ng
svn co https://github.com/coolsnowwolf/lede/trunk/package/lean/ipt2socks package/lean/ipt2socks
svn co https://github.com/coolsnowwolf/lede/trunk/package/lean/simple-obfs package/lean/simple-obfs
svn co https://github.com/coolsnowwolf/packages/trunk/net/shadowsocks-libev package/lean/shadowsocks-libev
diff --git a/SCRIPTS/R2S/02_target_only.sh b/SCRIPTS/R2S/02_target_only.sh
index a597c8dd7..20896c3bc 100644
--- a/SCRIPTS/R2S/02_target_only.sh
+++ b/SCRIPTS/R2S/02_target_only.sh
@@ -15,6 +15,12 @@ echo '
CONFIG_NR_CPUS=4
' >> ./target/linux/rockchip/armv8/config-5.4
+# UKSM
+echo '
+CONFIG_KSM=y
+CONFIG_UKSM=y
+' >> ./target/linux/rockchip/armv8/config-5.4
+
# Configure IRQ affinity and disable eth0 rx/tx offloading by default
sed -i '/set_interface_core 4 "eth1"/a\set_interface_core 8 "ff160000" "ff160000.i2c"' target/linux/rockchip/armv8/base-files/etc/hotplug.d/net/40-net-smp-affinity
sed -i '/set_interface_core 4 "eth1"/a\set_interface_core 1 "ff150000" "ff150000.i2c"' target/linux/rockchip/armv8/base-files/etc/hotplug.d/net/40-net-smp-affinity
diff --git a/SCRIPTS/R4S/02_target_only.sh b/SCRIPTS/R4S/02_target_only.sh
index 440006295..01e686e04 100644
--- a/SCRIPTS/R4S/02_target_only.sh
+++ b/SCRIPTS/R4S/02_target_only.sh
@@ -10,20 +10,18 @@ sed -i 's,kmod-r8169,kmod-r8168,g' target/linux/rockchip/image/armv8.mk
wget -P target/linux/rockchip/armv8/base-files/etc/init.d/ https://github.com/friendlyarm/friendlywrt/raw/master-v19.07.1/target/linux/rockchip-rk3399/base-files/etc/init.d/fa-rk3399-pwmfan
wget -P target/linux/rockchip/armv8/base-files/usr/bin/ https://github.com/friendlyarm/friendlywrt/raw/master-v19.07.1/target/linux/rockchip-rk3399/base-files/usr/bin/start-rk3399-pwm-fan.sh
-# Experimental features
-sed -i '/CRYPTO_DEV_ROCKCHIP/d' ./target/linux/rockchip/armv8/config-5.4
-sed -i '/HW_RANDOM_ROCKCHIP/d' ./target/linux/rockchip/armv8/config-5.4
-echo '
-CONFIG_CRYPTO_DEV_ROCKCHIP=y
-CONFIG_HW_RANDOM_ROCKCHIP=y
-' >> ./target/linux/rockchip/armv8/config-5.4
-
# CacULE
sed -i '/CONFIG_NR_CPUS/d' ./target/linux/rockchip/armv8/config-5.4
echo '
CONFIG_NR_CPUS=6
' >> ./target/linux/rockchip/armv8/config-5.4
+# UKSM
+echo '
+CONFIG_KSM=y
+CONFIG_UKSM=y
+' >> ./target/linux/rockchip/armv8/config-5.4
+
+# IRQ tuning
sed -i '/set_interface_core 20 "eth1"/a\set_interface_core 8 "ff3c0000" "ff3c0000.i2c"' target/linux/rockchip/armv8/base-files/etc/hotplug.d/net/40-net-smp-affinity
sed -i '/set_interface_core 20 "eth1"/a\ethtool -C eth0 rx-usecs 1000 rx-frames 25 tx-usecs 100 tx-frames 25' target/linux/rockchip/armv8/base-files/etc/hotplug.d/net/40-net-smp-affinity
diff --git a/SCRIPTS/X86/02_target_only.sh b/SCRIPTS/X86/02_target_only.sh
index 5c0ff3183..a513cfa69 100644
--- a/SCRIPTS/X86/02_target_only.sh
+++ b/SCRIPTS/X86/02_target_only.sh
@@ -10,6 +10,12 @@ echo '
CONFIG_CRYPTO_AES_NI_INTEL=y
' >> ./target/linux/x86/64/config-5.4
+# UKSM
+echo '
+CONFIG_KSM=y
+CONFIG_UKSM=y
+' >> ./target/linux/x86/64/config-5.4
+
#Vermagic
latest_version="$(curl -s https://github.com/openwrt/openwrt/releases |grep -Eo "v[0-9\.]+\-*r*c*[0-9]*.tar.gz" |sed -n '/21/p' |sed -n 1p |sed 's/v//g' |sed 's/.tar.gz//g')"
wget https://downloads.openwrt.org/releases/${latest_version}/targets/x86/64/packages/Packages.gz
diff --git a/SEED/R2S/config.seed b/SEED/R2S/config.seed
index ee9213c24..8768f8415 100644
--- a/SEED/R2S/config.seed
+++ b/SEED/R2S/config.seed
@@ -13,6 +13,7 @@ CONFIG_NODEJS_14=y
CONFIG_NODEJS_ICU_NONE=y
CONFIG_DEVEL=y
CONFIG_TOOLCHAINOPTS=y
+CONFIG_GCC_USE_VERSION_7=y
### BASIC ###
CONFIG_PACKAGE_r8169-firmware=y
@@ -41,9 +42,6 @@ CONFIG_PACKAGE_kmod-ipt-nat6=y
CONFIG_PACKAGE_kmod-tun=y
#CONFIG_PACKAGE_kmod-shortcut-fe=y
#CONFIG_PACKAGE_kmod-fast-classifier=y
-CONFIG_PACKAGE_kmod-usb2=y
-CONFIG_PACKAGE_kmod-usb2-pci=y
-CONFIG_PACKAGE_kmod-usb3=y
# CONFIG_PACKAGE_kmod-shortcut-fe-cm is not set
CONFIG_PACKAGE_libiwinfo=y
CONFIG_PACKAGE_libiwinfo-lua=y
@@ -132,12 +130,6 @@ CONFIG_PACKAGE_subconverter=y
CONFIG_PACKAGE_dnsproxy=y
CONFIG_PACKAGE_luci-app-dnsfilter=y
-### ZRAM ###
-CONFIG_PACKAGE_luci-app-compressed-memory=y
-CONFIG_PACKAGE_zram-swap=n
-CONFIG_PACKAGE_block-mount=y
-CONFIG_PACKAGE_swap-utils=y
-
### THEME ###
CONFIG_PACKAGE_luci-theme-argon=y
CONFIG_PACKAGE_luci-theme-bootstrap=y
diff --git a/SEED/R4S/config.seed b/SEED/R4S/config.seed
index fe01cd954..e38091c00 100644
--- a/SEED/R4S/config.seed
+++ b/SEED/R4S/config.seed
@@ -13,6 +13,7 @@ CONFIG_NODEJS_14=y
CONFIG_NODEJS_ICU_NONE=y
CONFIG_DEVEL=y
CONFIG_TOOLCHAINOPTS=y
+CONFIG_GCC_USE_VERSION_7=y
### BASIC ###
CONFIG_PACKAGE_r8169-firmware=y
diff --git a/SEED/X86/config.seed b/SEED/X86/config.seed
index 3c2c51f02..ec530887a 100644
--- a/SEED/X86/config.seed
+++ b/SEED/X86/config.seed
@@ -14,6 +14,7 @@ CONFIG_NODEJS_14=y
CONFIG_NODEJS_ICU_NONE=y
CONFIG_DEVEL=y
CONFIG_TOOLCHAINOPTS=y
+CONFIG_GCC_USE_VERSION_7=y
### BASIC ###
CONFIG_PACKAGE_addition-trans-zh=y
From 5f813331ca9b00a54351fd5fff4e1a414260cdec Mon Sep 17 00:00:00 2001
From: Topsy Chen <62056970+thomaswcy@users.noreply.github.com>
Date: Thu, 19 Aug 2021 12:35:32 +0800
Subject: [PATCH 178/244] Add Support for NanoPi R2C (#220)
---
.github/workflows/R2C-OpenWrt.yml | 147 ++++++++++++++
.github/workflows/R2S-OpenWrt.yml | 4 +-
.github/workflows/R4S-OpenWrt.yml | 4 +-
.github/workflows/X86-OpenWrt.yml | 4 +-
...U-temperature-for-thermal-throttling.patch | 4 +-
README.md | 13 +-
SCRIPTS/R2C/02_target_only.sh | 50 +++++
SEED/R2C/config.seed | 189 ++++++++++++++++++
8 files changed, 401 insertions(+), 14 deletions(-)
create mode 100644 .github/workflows/R2C-OpenWrt.yml
create mode 100644 SCRIPTS/R2C/02_target_only.sh
create mode 100644 SEED/R2C/config.seed
diff --git a/.github/workflows/R2C-OpenWrt.yml b/.github/workflows/R2C-OpenWrt.yml
new file mode 100644
index 000000000..2c8e8c023
--- /dev/null
+++ b/.github/workflows/R2C-OpenWrt.yml
@@ -0,0 +1,147 @@
+name: R2C-OpenWrt
+
+on:
+# schedule:
+# - cron: 5 6 * * 0
+ watch:
+ types: started
+
+jobs:
+ build:
+ runs-on: ubuntu-20.04
+ if: github.event.repository.owner.id == github.event.sender.id
+
+ steps:
+ - name: Checkout
+ uses: actions/checkout@main
+ - name: Show system
+ run: |
+ echo -e "Total CPU cores\t: $(nproc)"
+ cat /proc/cpuinfo | grep 'model name'
+ free -h
+ uname -a
+ [ -f /proc/version ] && cat /proc/version
+ [ -f /etc/issue.net ] && cat /etc/issue.net
+ [ -f /etc/issue ] && cat /etc/issue
+ ulimit -a
+ - name: Free disk space
+ run: |
+ sudo -E swapoff -a
+ sudo -E rm -f /swapfile
+ sudo -E docker image prune -a -f
+ sudo -E snap set system refresh.retain=2
+ sudo -E apt-get -y purge azure* dotnet* firefox ghc* google* hhvm llvm* mono* mysql* openjdk* php* zulu*
+ sudo -E apt-get -y autoremove --purge
+ sudo -E apt-get clean
+ sudo -E rm -rf /usr/share/dotnet /usr/local/lib/android/sdk /etc/mysql /etc/php /usr/local/share/boost
+ [ -n "$AGENT_TOOLSDIRECTORY" ] && sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+ df -h
+ - name: Init build dependencies
+ env:
+ DEBIAN_FRONTEND: noninteractive
+ run: |
+ sudo -E rm -rf /etc/apt/sources.list.d
+ sudo -E apt-get update -y
+ sudo -E apt-get install -y build-essential rsync asciidoc binutils bzip2 gawk gettext git libncurses5-dev libz-dev patch unzip zlib1g-dev lib32gcc1 libc6-dev-i386 subversion flex uglifyjs git-core p7zip p7zip-full msmtp libssl-dev texinfo libreadline-dev libglib2.0-dev xmlto qemu-utils upx libelf-dev autoconf automake libtool autopoint ccache curl wget vim nano python3 python3-pip python3-ply haveged lrzsz device-tree-compiler scons
+ wget -qO - https://raw.githubusercontent.com/friendlyarm/build-env-on-ubuntu-bionic/master/install.sh | sed 's/python-/python3-/g' | /bin/bash
+ sudo -E apt-get clean -y
+ git config --global user.name 'GitHub Actions' && git config --global user.email 'noreply@github.com'
+ df -h
+ - name: Prepare Mixedwrt
+ run: |
+ sudo chown -R runner:runner /home/runner/work/YAOF
+ cp -r ./SCRIPTS/R2C/. ./SCRIPTS/
+ cp -r ./SCRIPTS/. ./
+ /bin/bash 01_get_ready.sh
+ - name: Prepare Package
+ run: |
+ cd openwrt
+ rm -rf target/linux/rockchip
+ svn co https://github.com/immortalwrt/immortalwrt/branches/master/target/linux/rockchip target/linux/rockchip
+ rm -rf package/boot
+ svn co https://github.com/immortalwrt/immortalwrt/branches/master/package/boot package/boot
+ cp -r ../SCRIPTS/. ./
+ /bin/bash 02_prepare_package.sh
+ /bin/bash 02_target_only.sh
+ - name: Convert Translation
+ run: |
+ cd openwrt
+ /bin/bash 03_convert_translation.sh
+ - name: Add ACL
+ run: |
+ cd openwrt
+ /bin/bash 05_create_acl_for_luci.sh -a
+ - name: Make Config
+ run: |
+ cd openwrt
+ mv ../SEED/R2C/config.seed .config
+ make defconfig
+ - name: Smart chmod
+ run: |
+ MY_Filter=$(mktemp)
+ echo '/\.git' > ${MY_Filter}
+ echo '/\.svn' >> ${MY_Filter}
+ find ./ -maxdepth 1 | grep -v '\./$' | grep -v '/\.git' | xargs -s1024 chmod -R u=rwX,og=rX
+ find ./ -type f | grep -v -f ${MY_Filter} | xargs -s1024 file | grep 'executable\|ELF' | cut -d ':' -f1 | xargs -s1024 chmod 755
+ rm -f ${MY_Filter}
+ unset MY_Filter
+ - name: Make Download
+ run: |
+ df -h
+ cd openwrt
+ make download -j50
+ - name: Make Toolchain
+ run: |
+ df -h
+ cd openwrt
+ let make_process=$(nproc)+2
+ make toolchain/install -j${make_process} || make toolchain/install -j${make_process} V=s
+ - name: Compile Openwrt
+ id: compileopenwrt
+ continue-on-error: true
+ run: |
+ df -h
+ cd openwrt
+ let Make_Process=$(nproc)+2
+ make -j${Make_Process} || make -j${Make_Process} V=s
+ echo $?
+ - name: If Error
+ if: steps.compileopenwrt.outcome == 'failure'
+ run: |
+ cat openwrt/.config
+ echo '================================================================'
+ cd openwrt && make -j1 V=s
+ - name: Print Disk Space After
+ run: df -h
+ - name: Organize files
+ id: organize
+ run: |
+ rm -rf ./artifact/
+ mkdir -p ./artifact/
+ mv openwrt/bin/targets/rockchip/armv8/*sysupgrade.img* ./artifact/
+ cd ./artifact/
+ ls -Ahl
+ gzip -d *.gz && exit 0
+ gzip --best *.img
+ ls -Ahl
+ sha256sum openwrt*r2s* | tee R2C-GC404-$(date +%Y-%m-%d)-21.02.sha256sum
+ zip R2C-GC404-$(date +%Y-%m-%d)-21.02-ext4.zip *r2s*ext4*
+ zip R2C-GC404-$(date +%Y-%m-%d)-21.02-sfs.zip *r2s*squashfs*
+ ls -Ahl
+ - name: Upload artifact
+ uses: actions/upload-artifact@main
+ with:
+ name: OpenWRT_21.02
+ path: ./artifact/
+
+ - name: Create release
+ id: create_release
+ uses: ncipollo/release-action@v1.8.0
+ with:
+ name: OpenWRT-21.02.0-RC4
+ allowUpdates: true
+ tag: 21.02.0-rc4
+ commit: master
+ replacesArtifacts: true
+ token: ${{ secrets.workflow_token }}
+ artifacts: ./artifact/*.zip
diff --git a/.github/workflows/R2S-OpenWrt.yml b/.github/workflows/R2S-OpenWrt.yml
index 683a7cd00..98db410fc 100644
--- a/.github/workflows/R2S-OpenWrt.yml
+++ b/.github/workflows/R2S-OpenWrt.yml
@@ -49,7 +49,7 @@ jobs:
df -h
- name: Prepare Mixedwrt
run: |
- sudo chown -R runner:runner /home/runner/work/R2S-R4S-X86-OpenWrt
+ sudo chown -R runner:runner /home/runner/work/YAOF
cp -r ./SCRIPTS/R2S/. ./SCRIPTS/
cp -r ./SCRIPTS/. ./
/bin/bash 01_get_ready.sh
@@ -136,7 +136,7 @@ jobs:
id: create_release
uses: ncipollo/release-action@v1.8.0
with:
- name: OpenWRT-R2S-R4S-X86-21.02.0-RC4
+ name: OpenWRT-21.02.0-RC4
allowUpdates: true
tag: 21.02.0-rc4
commit: master
diff --git a/.github/workflows/R4S-OpenWrt.yml b/.github/workflows/R4S-OpenWrt.yml
index 23551c7c4..1973c97cf 100644
--- a/.github/workflows/R4S-OpenWrt.yml
+++ b/.github/workflows/R4S-OpenWrt.yml
@@ -49,7 +49,7 @@ jobs:
df -h
- name: Prepare Mixedwrt
run: |
- sudo chown -R runner:runner /home/runner/work/R2S-R4S-X86-OpenWrt
+ sudo chown -R runner:runner /home/runner/work/YAOF
cp -r ./SCRIPTS/R4S/. ./SCRIPTS/
cp -r ./SCRIPTS/. ./
/bin/bash 01_get_ready.sh
@@ -136,7 +136,7 @@ jobs:
id: create_release
uses: ncipollo/release-action@v1.8.0
with:
- name: OpenWRT-R2S-R4S-X86-21.02.0-RC4
+ name: OpenWRT-21.02.0-RC4
allowUpdates: true
tag: 21.02.0-rc4
commit: master
diff --git a/.github/workflows/X86-OpenWrt.yml b/.github/workflows/X86-OpenWrt.yml
index 74bd3dab5..650a14652 100644
--- a/.github/workflows/X86-OpenWrt.yml
+++ b/.github/workflows/X86-OpenWrt.yml
@@ -49,7 +49,7 @@ jobs:
df -h
- name: Prepare Mixedwrt
run: |
- sudo chown -R runner:runner /home/runner/work/R2S-R4S-X86-OpenWrt
+ sudo chown -R runner:runner /home/runner/work/YAOF
cp -r ./SCRIPTS/X86/. ./SCRIPTS/
cp -r ./SCRIPTS/. ./
/bin/bash 01_get_ready.sh
@@ -136,7 +136,7 @@ jobs:
id: create_release
uses: ncipollo/release-action@v1.8.0
with:
- name: OpenWRT-R2S-R4S-X86-21.02.0-RC4
+ name: OpenWRT-21.02.0-RC4
allowUpdates: true
tag: 21.02.0-rc4
commit: master
diff --git a/PATCH/target_r4s/213-RK3399-set-critical-CPU-temperature-for-thermal-throttling.patch b/PATCH/target_r4s/213-RK3399-set-critical-CPU-temperature-for-thermal-throttling.patch
index dca0a5d82..c4ba0dfed 100644
--- a/PATCH/target_r4s/213-RK3399-set-critical-CPU-temperature-for-thermal-throttling.patch
+++ b/PATCH/target_r4s/213-RK3399-set-critical-CPU-temperature-for-thermal-throttling.patch
@@ -16,13 +16,13 @@ index 2551b238b97c6..a53ff24d92bad 100644
trips {
cpu_alert0: cpu_alert0 {
- temperature = <70000>;
-+ temperature = <80000>;
++ temperature = <75000>;
hysteresis = <2000>;
type = "passive";
};
cpu_alert1: cpu_alert1 {
- temperature = <75000>;
-+ temperature = <85000>;
++ temperature = <80000>;
hysteresis = <2000>;
type = "passive";
};
diff --git a/README.md b/README.md
index 0eb683747..539b8e3d1 100644
--- a/README.md
+++ b/README.md
@@ -5,13 +5,14 @@
-
-
+
+
-
-
-
+
+
+
+
@@ -26,7 +27,7 @@
- The built-in upgrade function works, and the physical Reset button works
- Several plugins come pre-configured (including, but not limited to, nested DNS; note that nested DNS was dropped on June 29 in favour of dnsfilter for ad filtering and dnsproxy for DNS splitting, with port 5335 for overseas and port 6050 for domestic queries)
- Hassle-free opkg kmod installs
-- R2S cores run at 1.6 GHz (LAN and WAN are swapped); R4S cores run at 2.2/1.8 GHz (a 5V/4A power supply is recommended; most hangs are caused by a poor power supply, and you can also cap the maximum frequency with the bundled app 🍆)
+- R2C/R2S cores run at 1.6 GHz (LAN and WAN are swapped); R4S cores run at 2.2/1.8 GHz (a 5V/4A power supply is recommended; most hangs are caused by a poor power supply, and you can also cap the maximum frequency with the bundled app 🍆)
- Built with O3 and tuned CFLAGS
- Plugins include: SSRP, PassWall, OpenClash, AdguardHome, WeChat push, NetEase Cloud Music unlock, SQM, DNSProxy, Wake-on-LAN, DDNS, Xunlei KuaiNiao, UPNP, FullCone (enabled in the firewall by default), flow offloading, IRQ tuning, JD daily check-in, Zerotier, FRPC, FRPS, wireless printing, traffic monitoring, filter toolbox, R2S-OLED
- The ss protocol gets AES hardware acceleration on armv8 (please use AEAD ciphers only)
diff --git a/SCRIPTS/R2C/02_target_only.sh b/SCRIPTS/R2C/02_target_only.sh
new file mode 100644
index 000000000..20896c3bc
--- /dev/null
+++ b/SCRIPTS/R2C/02_target_only.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+clear
+
+# Use device-specific optimizations
+sed -i 's,-mcpu=generic,-mcpu=cortex-a53+crypto,g' include/target.mk
+cp -f ../PATCH/mbedtls/100-Implements-AES-and-GCM-with-ARMv8-Crypto-Extensions.patch ./package/libs/mbedtls/patches/100-Implements-AES-and-GCM-with-ARMv8-Crypto-Extensions.patch
+
+# Add fan speed control (isn't passive cooling good enough?)
+wget -P target/linux/rockchip/armv8/base-files/etc/init.d/ https://github.com/friendlyarm/friendlywrt/raw/master-v19.07.1/target/linux/rockchip-rk3328/base-files/etc/init.d/fa-rk3328-pwmfan
+wget -P target/linux/rockchip/armv8/base-files/usr/bin/ https://github.com/friendlyarm/friendlywrt/raw/master-v19.07.1/target/linux/rockchip-rk3328/base-files/usr/bin/start-rk3328-pwm-fan.sh
+
+# CacULE
+sed -i '/CONFIG_NR_CPUS/d' ./target/linux/rockchip/armv8/config-5.4
+echo '
+CONFIG_NR_CPUS=4
+' >> ./target/linux/rockchip/armv8/config-5.4
+
+# UKSM
+echo '
+CONFIG_KSM=y
+CONFIG_UKSM=y
+' >> ./target/linux/rockchip/armv8/config-5.4
+
+# Configure IRQ affinity and disable eth0 rx/tx offloading by default
+sed -i '/set_interface_core 4 "eth1"/a\set_interface_core 8 "ff160000" "ff160000.i2c"' target/linux/rockchip/armv8/base-files/etc/hotplug.d/net/40-net-smp-affinity
+sed -i '/set_interface_core 4 "eth1"/a\set_interface_core 1 "ff150000" "ff150000.i2c"' target/linux/rockchip/armv8/base-files/etc/hotplug.d/net/40-net-smp-affinity
+sed -i '/;;/i\ethtool -K eth0 rx off tx off && logger -t disable-offloading "disabed rk3328 ethernet tcp/udp offloading tx/rx"' target/linux/rockchip/armv8/base-files/etc/hotplug.d/net/40-net-smp-affinity
+
+# Swap the LAN/WAN ports
+sed -i 's,"eth1" "eth0","eth0" "eth1",g' target/linux/rockchip/armv8/base-files/etc/board.d/02_network
+sed -i "s,'eth1' 'eth0','eth0' 'eth1',g" target/linux/rockchip/armv8/base-files/etc/board.d/02_network
+
+# Add the R2S GPU driver
+rm -rf ./package/kernel/linux/modules/video.mk
+wget -P package/kernel/linux/modules/ https://github.com/immortalwrt/immortalwrt/raw/master/package/kernel/linux/modules/video.mk
+
+#Vermagic
+latest_version="$(curl -s https://github.com/openwrt/openwrt/releases |grep -Eo "v[0-9\.]+\-*r*c*[0-9]*.tar.gz" |sed -n '/21/p' |sed -n 1p |sed 's/v//g' |sed 's/.tar.gz//g')"
+wget https://downloads.openwrt.org/releases/${latest_version}/targets/rockchip/armv8/packages/Packages.gz
+zgrep -m 1 "Depends: kernel (=.*)$" Packages.gz | sed -e 's/.*-\(.*\))/\1/' > .vermagic
+sed -i -e 's/^\(.\).*vermagic$/\1cp $(TOPDIR)\/.vermagic $(LINUX_DIR)\/.vermagic/' include/kernel-defaults.mk
+
+# Pre-configure some plugins
+cp -rf ../PATCH/files ./files
+
+chmod -R 755 ./
+find ./ -name *.orig | xargs rm -f
+find ./ -name *.rej | xargs rm -f
+
+#exit 0
diff --git a/SEED/R2C/config.seed b/SEED/R2C/config.seed
new file mode 100644
index 000000000..67d17824a
--- /dev/null
+++ b/SEED/R2C/config.seed
@@ -0,0 +1,189 @@
+### INITIALIZATION ###
+CONFIG_KERNEL_BUILD_DOMAIN="buildhost"
+CONFIG_KERNEL_BUILD_USER="builder"
+CONFIG_TARGET_rockchip=y
+CONFIG_TARGET_rockchip_armv8=y
+CONFIG_TARGET_rockchip_armv8_DEVICE_friendlyarm_nanopi-r2c=y
+CONFIG_TARGET_ROOTFS_PARTSIZE=864
+# CONFIG_COLLECT_KERNEL_DEBUG is not set
+
+### AGGRESSIVE ###
+CONFIG_PACKAGE_node=y
+CONFIG_NODEJS_14=y
+CONFIG_NODEJS_ICU_NONE=y
+CONFIG_DEVEL=y
+CONFIG_TOOLCHAINOPTS=y
+CONFIG_GCC_USE_VERSION_7=y
+
+### BASIC ###
+CONFIG_PACKAGE_r8169-firmware=y
+CONFIG_PACKAGE_addition-trans-zh=y
+CONFIG_PACKAGE_arm-trusted-firmware-rockchip=y
+CONFIG_PACKAGE_autocore-arm=y
+CONFIG_PACKAGE_ipv6helper=y
+CONFIG_PACKAGE_cgi-io=y
+CONFIG_PACKAGE_ethtool=y
+CONFIG_PACKAGE_diffutils=y
+CONFIG_PACKAGE_dnsmasq-full=y
+CONFIG_PACKAGE_dnsmasq_full_auth=y
+CONFIG_PACKAGE_dnsmasq_full_conntrack=y
+CONFIG_PACKAGE_dnsmasq_full_dhcp=y
+CONFIG_PACKAGE_dnsmasq_full_dhcpv6=y
+CONFIG_PACKAGE_dnsmasq_full_dnssec=y
+CONFIG_PACKAGE_dnsmasq_full_ipset=y
+CONFIG_PACKAGE_dnsmasq_full_noid=y
+CONFIG_PACKAGE_dnsmasq_full_tftp=y
+# CONFIG_PACKAGE_dnsmasq is not set
+CONFIG_PACKAGE_iptables-mod-fullconenat=y
+CONFIG_PACKAGE_ip6tables-mod-fullconenat=y
+CONFIG_PACKAGE_iptables-mod-nat-extra=y
+CONFIG_PACKAGE_kmod-fs-f2fs=y
+CONFIG_PACKAGE_kmod-ipt-nat6=y
+CONFIG_PACKAGE_kmod-tun=y
+#CONFIG_PACKAGE_kmod-shortcut-fe=y
+#CONFIG_PACKAGE_kmod-fast-classifier=y
+# CONFIG_PACKAGE_kmod-shortcut-fe-cm is not set
+CONFIG_PACKAGE_libiwinfo=y
+CONFIG_PACKAGE_libiwinfo-lua=y
+CONFIG_PACKAGE_liblua=y
+CONFIG_PACKAGE_liblucihttp=y
+CONFIG_PACKAGE_liblucihttp-lua=y
+CONFIG_PACKAGE_libubus-lua=y
+CONFIG_PACKAGE_libustream-openssl=y
+# CONFIG_PACKAGE_libustream-wolfssl is not set
+CONFIG_PACKAGE_rpcd=y
+CONFIG_PACKAGE_rpcd-mod-file=y
+CONFIG_PACKAGE_rpcd-mod-iwinfo=y
+CONFIG_PACKAGE_rpcd-mod-luci=y
+CONFIG_PACKAGE_rpcd-mod-rrdns=y
+CONFIG_PACKAGE_uhttpd=y
+CONFIG_PACKAGE_wget-ssl=y
+CONFIG_PACKAGE_lua=y
+# CONFIG_PACKAGE_miniupnpd is not set
+CONFIG_PACKAGE_miniupnpd-igdv1=y
+
+### LUCI ###
+CONFIG_LUCI_LANG_zh_Hans=y
+CONFIG_LUCI_LANG_en=y
+# CONFIG_LUCI_CSSTIDY is not set
+# CONFIG_LUCI_JSMIN is not set
+CONFIG_PACKAGE_luci=y
+CONFIG_PACKAGE_luci-app-firewall=y
+CONFIG_PACKAGE_luci-base=y
+CONFIG_PACKAGE_luci-compat=y
+CONFIG_PACKAGE_luci-lib-ip=y
+CONFIG_PACKAGE_luci-lib-ipkg=y
+CONFIG_PACKAGE_luci-lib-jsonc=y
+CONFIG_PACKAGE_luci-lib-nixio=y
+CONFIG_PACKAGE_luci-mod-admin-full=y
+CONFIG_PACKAGE_luci-mod-network=y
+CONFIG_PACKAGE_luci-mod-status=y
+CONFIG_PACKAGE_luci-mod-system=y
+CONFIG_PACKAGE_luci-proto-ipv6=y
+CONFIG_PACKAGE_luci-proto-ppp=y
+
+### APPLICATION ###
+#CONFIG_PACKAGE_luci-app-adguardhome=y
+#CONFIG_PACKAGE_adguardhome=y
+CONFIG_PACKAGE_luci-app-wireguard=y
+CONFIG_PACKAGE_luci-app-socat=y
+CONFIG_PACKAGE_luci-app-argon-config=y
+CONFIG_PACKAGE_luci-app-arpbind=y
+CONFIG_PACKAGE_luci-app-autoreboot=y
+CONFIG_PACKAGE_luci-app-boostupnp=y
+CONFIG_PACKAGE_luci-app-control-weburl=y
+CONFIG_PACKAGE_luci-app-cpufreq=y
+CONFIG_PACKAGE_luci-app-cpulimit=y
+CONFIG_PACKAGE_luci-app-ddns=y
+CONFIG_PACKAGE_ddns-scripts-cloudflare=y
+CONFIG_PACKAGE_ddns-scripts-dnspod=y
+CONFIG_PACKAGE_ddns-scripts-aliyun=y
+CONFIG_PACKAGE_luci-app-frpc=y
+CONFIG_PACKAGE_frpc=y
+CONFIG_PACKAGE_luci-app-frps=y
+CONFIG_PACKAGE_frps=y
+CONFIG_PACKAGE_luci-app-jd-dailybonus=y
+CONFIG_PACKAGE_luci-app-dnsproxy=y
+#CONFIG_PACKAGE_luci-app-oled=y
+CONFIG_PACKAGE_luci-app-openclash=y
+CONFIG_PACKAGE_luci-app-opkg=y
+#CONFIG_PACKAGE_luci-app-qbittorrent=y
+CONFIG_PACKAGE_luci-app-ramfree=y
+CONFIG_PACKAGE_luci-app-serverchan=y
+#CONFIG_PACKAGE_luci-app-smartdns=y
+CONFIG_PACKAGE_luci-app-sqm=y
+CONFIG_PACKAGE_luci-app-vlmcsd=y
+CONFIG_PACKAGE_luci-app-vssr=y
+CONFIG_PACKAGE_luci-app-ssr-plus=y
+CONFIG_PACKAGE_luci-app-passwall=y
+CONFIG_PACKAGE_luci-app-unblockneteasemusic=y
+CONFIG_PACKAGE_luci-app-upnp=y
+CONFIG_PACKAGE_luci-app-usb-printer=y
+CONFIG_PACKAGE_luci-app-services-wolplus=y
+CONFIG_PACKAGE_luci-app-wrtbwmon=y
+CONFIG_PACKAGE_luci-app-xlnetacc=y
+CONFIG_PACKAGE_luci-app-zerotier=y
+CONFIG_PACKAGE_zerotier=y
+CONFIG_PACKAGE_coremark=y
+CONFIG_PACKAGE_htop=y
+CONFIG_PACKAGE_subconverter=y
+CONFIG_PACKAGE_dnsproxy=y
+CONFIG_PACKAGE_luci-app-dnsfilter=y
+
+### THEME ###
+CONFIG_PACKAGE_luci-theme-argon=y
+CONFIG_PACKAGE_luci-theme-bootstrap=y
+
+### RESET ###
+CONFIG_PACKAGE_kmod-gpio-button-hotplug=y
+
+### SHARE NETWORK ###
+CONFIG_PACKAGE_usbmuxd=y
+CONFIG_PACKAGE_kmod-usb-net-rndis=y
+CONFIG_PACKAGE_kmod-usb-net-ipheth=y
+
+### BBRv2 ###
+CONFIG_PACKAGE_kmod-tcp-bbr2=y
+
+### OPENSSL ###
+CONFIG_OPENSSL_ENGINE=y
+CONFIG_OPENSSL_OPTIMIZE_SPEED=y
+CONFIG_OPENSSL_WITH_ASM=y
+# CONFIG_OPENSSL_WITH_ERROR_MESSAGES is not set
+CONFIG_PACKAGE_libopenssl-devcrypto=y
+CONFIG_PACKAGE_libopenssl-conf=y
+CONFIG_PACKAGE_openssl-util=y
+CONFIG_PACKAGE_libcurl=y
+CONFIG_LIBCURL_OPENSSL=y
+# CONFIG_LIBCURL_MBEDTLS is not set
+# CONFIG_LIBCURL_WOLFSSL is not set
+# CONFIG_LIBCURL_GNUTLS is not set
+# CONFIG_LIBCURL_NOSSL is not set
+
+
+### DOCKER ### (experimental; uncomment the lines below if needed)
+
+#CONFIG_PACKAGE_luci-app-dockerman=y
+#CONFIG_DOCKER_KERNEL_OPTIONS=y
+#CONFIG_DOCKER_NET_ENCRYPT=y
+#CONFIG_DOCKER_NET_MACVLAN=y
+#CONFIG_DOCKER_NET_OVERLAY=y
+#CONFIG_DOCKER_NET_TFTP=y
+#CONFIG_DOCKER_RES_SHAPE=y
+#CONFIG_DOCKER_SECCOMP=y
+#CONFIG_DOCKER_STO_BTRFS=y
+#CONFIG_DOCKER_STO_EXT4=y
+
+### Video Support ### (experimental; uncomment the lines below if needed)
+#CONFIG_PACKAGE_kmod-backlight=y
+#CONFIG_PACKAGE_kmod-drm=y
+#CONFIG_PACKAGE_kmod-drm-kms-helper=y
+#CONFIG_PACKAGE_kmod-drm-rockchip=y
+#CONFIG_PACKAGE_kmod-fb=y
+#CONFIG_PACKAGE_kmod-fb-cfb-copyarea=y
+#CONFIG_PACKAGE_kmod-fb-cfb-fillrect=y
+#CONFIG_PACKAGE_kmod-fb-cfb-imgblt=y
+#CONFIG_PACKAGE_kmod-fb-sys-fops=y
+#CONFIG_PACKAGE_kmod-fb-sys-ram=y
+#CONFIG_PACKAGE_kmod-gpu-lima=y
+#CONFIG_PACKAGE_kmod-multimedia-input=y
From e2a2b4c89954e94bbec42cdd48433eec9a0196ad Mon Sep 17 00:00:00 2001
From: QiuSimons <45143996+QiuSimons@users.noreply.github.com>
Date: Thu, 19 Aug 2021 14:41:48 +0800
Subject: [PATCH 179/244] Strip Unnecessary Files
---
.github/workflows/R2C-OpenWrt.yml | 8 ++---
SCRIPTS/02_prepare_package.sh | 1 +
SCRIPTS/R2C/02_target_only.sh | 50 -------------------------------
3 files changed, 5 insertions(+), 54 deletions(-)
delete mode 100644 SCRIPTS/R2C/02_target_only.sh
diff --git a/.github/workflows/R2C-OpenWrt.yml b/.github/workflows/R2C-OpenWrt.yml
index 2c8e8c023..2aefd5a03 100644
--- a/.github/workflows/R2C-OpenWrt.yml
+++ b/.github/workflows/R2C-OpenWrt.yml
@@ -50,7 +50,7 @@ jobs:
- name: Prepare Mixedwrt
run: |
sudo chown -R runner:runner /home/runner/work/YAOF
- cp -r ./SCRIPTS/R2C/. ./SCRIPTS/
+ cp -r ./SCRIPTS/R2S/. ./SCRIPTS/
cp -r ./SCRIPTS/. ./
/bin/bash 01_get_ready.sh
- name: Prepare Package
@@ -124,9 +124,9 @@ jobs:
gzip -d *.gz && exit 0
gzip --best *.img
ls -Ahl
- sha256sum openwrt*r2s* | tee R2C-GC404-$(date +%Y-%m-%d)-21.02.sha256sum
- zip R2C-GC404-$(date +%Y-%m-%d)-21.02-ext4.zip *r2s*ext4*
- zip R2C-GC404-$(date +%Y-%m-%d)-21.02-sfs.zip *r2s*squashfs*
+ sha256sum openwrt*r2c* | tee R2C-GC404-$(date +%Y-%m-%d)-21.02.sha256sum
+ zip R2C-GC404-$(date +%Y-%m-%d)-21.02-ext4.zip *r2c*ext4*
+ zip R2C-GC404-$(date +%Y-%m-%d)-21.02-sfs.zip *r2c*squashfs*
ls -Ahl
- name: Upload artifact
uses: actions/upload-artifact@main
diff --git a/SCRIPTS/02_prepare_package.sh b/SCRIPTS/02_prepare_package.sh
index 934c8bae3..cafe45f6e 100644
--- a/SCRIPTS/02_prepare_package.sh
+++ b/SCRIPTS/02_prepare_package.sh
@@ -38,6 +38,7 @@ wget -qO - https://github.com/openwrt/openwrt/commit/cfaf039.patch | patch -p1
# CacULE
wget -qO - https://github.com/QiuSimons/openwrt-NoTengoBattery/commit/7d44cab.patch | patch -p1
wget https://github.com/hamadmarri/cacule-cpu-scheduler/raw/master/patches/CacULE/v5.4/cacule-5.4.patch -O ./target/linux/generic/hack-5.4/694-cacule-5.4.patch
+#wget https://github.com/hamadmarri/cacule-cpu-scheduler/raw/vR2.1/patches/CacULE/v5.4/cacule-5.4.patch -O ./target/linux/generic/hack-5.4/694-cacule-5.4.patch
# UKSM
cp -f ../PATCH/UKSM/695-uksm-5.4.patch ./target/linux/generic/hack-5.4/695-uksm-5.4.patch
# Grub 2
diff --git a/SCRIPTS/R2C/02_target_only.sh b/SCRIPTS/R2C/02_target_only.sh
deleted file mode 100644
index 20896c3bc..000000000
--- a/SCRIPTS/R2C/02_target_only.sh
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/bin/bash
-clear
-
-# Use target-specific optimizations
-sed -i 's,-mcpu=generic,-mcpu=cortex-a53+crypto,g' include/target.mk
-cp -f ../PATCH/mbedtls/100-Implements-AES-and-GCM-with-ARMv8-Crypto-Extensions.patch ./package/libs/mbedtls/patches/100-Implements-AES-and-GCM-with-ARMv8-Crypto-Extensions.patch
-
-# Add fan speed control (isn't passive cooling good enough?)
-wget -P target/linux/rockchip/armv8/base-files/etc/init.d/ https://github.com/friendlyarm/friendlywrt/raw/master-v19.07.1/target/linux/rockchip-rk3328/base-files/etc/init.d/fa-rk3328-pwmfan
-wget -P target/linux/rockchip/armv8/base-files/usr/bin/ https://github.com/friendlyarm/friendlywrt/raw/master-v19.07.1/target/linux/rockchip-rk3328/base-files/usr/bin/start-rk3328-pwm-fan.sh
-
-# CacULE
-sed -i '/CONFIG_NR_CPUS/d' ./target/linux/rockchip/armv8/config-5.4
-echo '
-CONFIG_NR_CPUS=4
-' >> ./target/linux/rockchip/armv8/config-5.4
-
-# UKSM
-echo '
-CONFIG_KSM=y
-CONFIG_UKSM=y
-' >> ./target/linux/rockchip/armv8/config-5.4
-
-# Configure IRQ affinity and disable eth0 offloading tx/rx by default
-sed -i '/set_interface_core 4 "eth1"/a\set_interface_core 8 "ff160000" "ff160000.i2c"' target/linux/rockchip/armv8/base-files/etc/hotplug.d/net/40-net-smp-affinity
-sed -i '/set_interface_core 4 "eth1"/a\set_interface_core 1 "ff150000" "ff150000.i2c"' target/linux/rockchip/armv8/base-files/etc/hotplug.d/net/40-net-smp-affinity
-sed -i '/;;/i\ethtool -K eth0 rx off tx off && logger -t disable-offloading "disabed rk3328 ethernet tcp/udp offloading tx/rx"' target/linux/rockchip/armv8/base-files/etc/hotplug.d/net/40-net-smp-affinity
-
-# Swap the LAN/WAN ports
-sed -i 's,"eth1" "eth0","eth0" "eth1",g' target/linux/rockchip/armv8/base-files/etc/board.d/02_network
-sed -i "s,'eth1' 'eth0','eth0' 'eth1',g" target/linux/rockchip/armv8/base-files/etc/board.d/02_network
-
-# Add the R2S GPU driver
-rm -rf ./package/kernel/linux/modules/video.mk
-wget -P package/kernel/linux/modules/ https://github.com/immortalwrt/immortalwrt/raw/master/package/kernel/linux/modules/video.mk
-
-#Vermagic
-latest_version="$(curl -s https://github.com/openwrt/openwrt/releases |grep -Eo "v[0-9\.]+\-*r*c*[0-9]*.tar.gz" |sed -n '/21/p' |sed -n 1p |sed 's/v//g' |sed 's/.tar.gz//g')"
-wget https://downloads.openwrt.org/releases/${latest_version}/targets/rockchip/armv8/packages/Packages.gz
-zgrep -m 1 "Depends: kernel (=.*)$" Packages.gz | sed -e 's/.*-\(.*\))/\1/' > .vermagic
-sed -i -e 's/^\(.\).*vermagic$/\1cp $(TOPDIR)\/.vermagic $(LINUX_DIR)\/.vermagic/' include/kernel-defaults.mk
-
-# Pre-configure some plugins
-cp -rf ../PATCH/files ./files
-
-chmod -R 755 ./
-find ./ -name *.orig | xargs rm -f
-find ./ -name *.rej | xargs rm -f
-
-#exit 0
From ff7d6e6dc21062b6ed7cad3f40d7fa7c4a7c4878 Mon Sep 17 00:00:00 2001
From: QiuSimons <45143996+QiuSimons@users.noreply.github.com>
Date: Mon, 23 Aug 2021 17:05:12 +0800
Subject: [PATCH 180/244] Update dnsfilter
---
PATCH/files/etc/config/dnsfilter | 5 ++---
SCRIPTS/02_prepare_package.sh | 3 +++
2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/PATCH/files/etc/config/dnsfilter b/PATCH/files/etc/config/dnsfilter
index 0998cdd46..6fbc3f71d 100644
--- a/PATCH/files/etc/config/dnsfilter
+++ b/PATCH/files/etc/config/dnsfilter
@@ -4,7 +4,6 @@ config dnsfilter
option time_update '6'
option flash '0'
option enable '1'
- list url 'https://cdn.jsdelivr.net/gh/AdguardTeam/FiltersRegistry@master/filters/filter_2_English/filter.txt'
- list url 'https://cdn.jsdelivr.net/gh/AdguardTeam/FiltersRegistry@master/filters/filter_224_Chinese/filter.txt'
- list url 'https://cdn.jsdelivr.net/gh/AdguardTeam/FiltersRegistry@master/filters/filter_14_Annoyances/filter.txt'
+ list url 'https://cdn.jsdelivr.net/gh/AdguardTeam/AdGuardSDNSFilter@gh-pages/Filters/filter.txt'
+ list url 'https://cdn.jsdelivr.net/gh/easylist/easylistchina@master/easylistchina.txt'
diff --git a/SCRIPTS/02_prepare_package.sh b/SCRIPTS/02_prepare_package.sh
index cafe45f6e..95250684d 100644
--- a/SCRIPTS/02_prepare_package.sh
+++ b/SCRIPTS/02_prepare_package.sh
@@ -340,6 +340,9 @@ svn co https://github.com/immortalwrt/packages/trunk/libs/quickjspp feeds/packag
ln -sf ../../../feeds/packages/libs/quickjspp ./package/feeds/packages/quickjspp
# Unlock NetEase Cloud Music
git clone --depth 1 https://github.com/immortalwrt/luci-app-unblockneteasemusic.git package/new/UnblockNeteaseMusic
+#pushd package/new/UnblockNeteaseMusic
+#wget -qO - https://github.com/immortalwrt/luci-app-unblockneteasemusic/pull/78.patch | patch -p1
+#popd
# USB printer
svn co https://github.com/coolsnowwolf/lede/trunk/package/lean/luci-app-usb-printer package/lean/luci-app-usb-printer
# UU game accelerator
From 8a7c85ef702be80002c41c913fb42f29fb71224c Mon Sep 17 00:00:00 2001
From: QiuSimons <45143996+QiuSimons@users.noreply.github.com>
Date: Wed, 25 Aug 2021 18:11:56 +0800
Subject: [PATCH 181/244] Add LRNG Support
---
PATCH/LRNG/696-01-v41-000001-sha1.patch | 17 +
.../696-02-v41-000002-GRND_INSECURE.patch | 13 +
.../696-03-v41-000003-jent-backport.patch | 80 +
...1-0001-Linux-Random-Number-Generator.patch | 4162 +++++++++++++++++
...cate-one-DRNG-instance-per-NUMA-node.patch | 201 +
...0003-LRNG-sysctls-and-proc-interface.patch | 287 ++
...004-LRNG-add-switchable-DRNG-support.patch | 326 ++
...LRNG-add-common-generic-hash-support.patch | 198 +
...-externalize-DRBG-functions-for-LRNG.patch | 139 +
...07-LRNG-add-SP800-90A-DRBG-extension.patch | 278 ++
...add-kernel-crypto-API-PRNG-extension.patch | 307 ++
...de-access-to-a-static-Jitter-RNG-sta.patch | 148 +
...RNG-add-Jitter-RNG-fast-noise-source.patch | 174 +
...add-SP800-90B-compliant-health-tests.patch | 623 +++
...terface-for-gathering-of-raw-entropy.patch | 956 ++++
...-add-power-on-and-runtime-self-tests.patch | 494 ++
.../696-17-v41-01-remove_compat_ioctl.patch | 18 +
...-v41-02-jitterentropy_remove_RCT_APT.patch | 15 +
.../696-19-v41-03-arch_random_early.patch | 13 +
PATCH/LRNG/696-20-v41-04-net_rand_state.patch | 10 +
PATCH/LRNG/696-21-v41-05-kzfree.patch | 44 +
PATCH/LRNG/696-22-v41-06-sha-includes.patch | 32 +
SCRIPTS/02_prepare_package.sh | 6 +
23 files changed, 8541 insertions(+)
create mode 100644 PATCH/LRNG/696-01-v41-000001-sha1.patch
create mode 100644 PATCH/LRNG/696-02-v41-000002-GRND_INSECURE.patch
create mode 100644 PATCH/LRNG/696-03-v41-000003-jent-backport.patch
create mode 100644 PATCH/LRNG/696-04-v41-0001-Linux-Random-Number-Generator.patch
create mode 100644 PATCH/LRNG/696-05-v41-0002-LRNG-allocate-one-DRNG-instance-per-NUMA-node.patch
create mode 100644 PATCH/LRNG/696-06-v41-0003-LRNG-sysctls-and-proc-interface.patch
create mode 100644 PATCH/LRNG/696-07-v41-0004-LRNG-add-switchable-DRNG-support.patch
create mode 100644 PATCH/LRNG/696-08-v41-0005-LRNG-add-common-generic-hash-support.patch
create mode 100644 PATCH/LRNG/696-09-v41-0006-crypto-DRBG-externalize-DRBG-functions-for-LRNG.patch
create mode 100644 PATCH/LRNG/696-10-v41-0007-LRNG-add-SP800-90A-DRBG-extension.patch
create mode 100644 PATCH/LRNG/696-11-v41-0008-LRNG-add-kernel-crypto-API-PRNG-extension.patch
create mode 100644 PATCH/LRNG/696-12-v41-0009-crypto-provide-access-to-a-static-Jitter-RNG-sta.patch
create mode 100644 PATCH/LRNG/696-13-v41-0010-LRNG-add-Jitter-RNG-fast-noise-source.patch
create mode 100644 PATCH/LRNG/696-14-v41-0011-LRNG-add-SP800-90B-compliant-health-tests.patch
create mode 100644 PATCH/LRNG/696-15-v41-0012-LRNG-add-interface-for-gathering-of-raw-entropy.patch
create mode 100644 PATCH/LRNG/696-16-v41-0013-LRNG-add-power-on-and-runtime-self-tests.patch
create mode 100644 PATCH/LRNG/696-17-v41-01-remove_compat_ioctl.patch
create mode 100644 PATCH/LRNG/696-18-v41-02-jitterentropy_remove_RCT_APT.patch
create mode 100644 PATCH/LRNG/696-19-v41-03-arch_random_early.patch
create mode 100644 PATCH/LRNG/696-20-v41-04-net_rand_state.patch
create mode 100644 PATCH/LRNG/696-21-v41-05-kzfree.patch
create mode 100644 PATCH/LRNG/696-22-v41-06-sha-includes.patch
diff --git a/PATCH/LRNG/696-01-v41-000001-sha1.patch b/PATCH/LRNG/696-01-v41-000001-sha1.patch
new file mode 100644
index 000000000..f4f5010e4
--- /dev/null
+++ b/PATCH/LRNG/696-01-v41-000001-sha1.patch
@@ -0,0 +1,17 @@
+--- a/include/crypto/sha.h
++++ b/include/crypto/sha.h
+@@ -159,4 +159,14 @@ extern int sha224_update(struct sha256_s
+ unsigned int length);
+ extern int sha224_final(struct sha256_state *sctx, u8 *hash);
+
++/*
++ * An implementation of SHA-1's compression function. Don't use in new code!
++ * You shouldn't be using SHA-1, and even if you *have* to use SHA-1, this isn't
++ * the correct way to hash something with SHA-1 (use crypto_shash instead).
++ */
++#define SHA1_DIGEST_WORDS (SHA1_DIGEST_SIZE / 4)
++#define SHA1_WORKSPACE_WORDS 16
++void sha_init(__u32 *buf);
++void sha_transform(__u32 *digest, const char *data, __u32 *W);
++
+ #endif
diff --git a/PATCH/LRNG/696-02-v41-000002-GRND_INSECURE.patch b/PATCH/LRNG/696-02-v41-000002-GRND_INSECURE.patch
new file mode 100644
index 000000000..268235d11
--- /dev/null
+++ b/PATCH/LRNG/696-02-v41-000002-GRND_INSECURE.patch
@@ -0,0 +1,13 @@
+--- a/include/uapi/linux/random.h
++++ b/include/uapi/linux/random.h
+@@ -49,8 +49,10 @@ struct rand_pool_info {
+ *
+ * GRND_NONBLOCK Don't block and return EAGAIN instead
+ * GRND_RANDOM Use the /dev/random pool instead of /dev/urandom
++ * GRND_INSECURE Return non-cryptographic random bytes
+ */
+ #define GRND_NONBLOCK 0x0001
+ #define GRND_RANDOM 0x0002
++#define GRND_INSECURE 0x0004
+
+ #endif /* _UAPI_LINUX_RANDOM_H */
diff --git a/PATCH/LRNG/696-03-v41-000003-jent-backport.patch b/PATCH/LRNG/696-03-v41-000003-jent-backport.patch
new file mode 100644
index 000000000..0feef6dfe
--- /dev/null
+++ b/PATCH/LRNG/696-03-v41-000003-jent-backport.patch
@@ -0,0 +1,80 @@
+commit 965d7286d871b622dcaaafd2e2346b11631584ff
+Author: Ben Dooks
+Date: Wed Oct 9 10:12:56 2019 +0100
+
+ crypto: jitter - add header to fix buildwarnings
+
+ Fix the following build warnings by adding a header for
+ the definitions shared between jitterentropy.c and
+ jitterentropy-kcapi.c. Fixes the following:
+
+ crypto/jitterentropy.c:445:5: warning: symbol 'jent_read_entropy' was not declared. Should it be static?
+ crypto/jitterentropy.c:475:18: warning: symbol 'jent_entropy_collector_alloc' was not declared. Should it be static?
+ crypto/jitterentropy.c:509:6: warning: symbol 'jent_entropy_collector_free' was not declared. Should it be static?
+ crypto/jitterentropy.c:516:5: warning: symbol 'jent_entropy_init' was not declared. Should it be static?
+ crypto/jitterentropy-kcapi.c:59:6: warning: symbol 'jent_zalloc' was not declared. Should it be static?
+ crypto/jitterentropy-kcapi.c:64:6: warning: symbol 'jent_zfree' was not declared. Should it be static?
+ crypto/jitterentropy-kcapi.c:69:5: warning: symbol 'jent_fips_enabled' was not declared. Should it be static?
+ crypto/jitterentropy-kcapi.c:74:6: warning: symbol 'jent_panic' was not declared. Should it be static?
+ crypto/jitterentropy-kcapi.c:79:6: warning: symbol 'jent_memcpy' was not declared. Should it be static?
+ crypto/jitterentropy-kcapi.c:93:6: warning: symbol 'jent_get_nstime' was not declared. Should it be static?
+
+ Signed-off-by: Ben Dooks
+ Reviewed-by: Stephan Mueller
+
+--- a/crypto/jitterentropy-kcapi.c
++++ b/crypto/jitterentropy-kcapi.c
+@@ -44,13 +44,7 @@
+ #include
+ #include
+
+-struct rand_data;
+-int jent_read_entropy(struct rand_data *ec, unsigned char *data,
+- unsigned int len);
+-int jent_entropy_init(void);
+-struct rand_data *jent_entropy_collector_alloc(unsigned int osr,
+- unsigned int flags);
+-void jent_entropy_collector_free(struct rand_data *entropy_collector);
++#include "jitterentropy.h"
+
+ /***************************************************************************
+ * Helper function
+--- a/crypto/jitterentropy.c
++++ b/crypto/jitterentropy.c
+@@ -103,12 +103,7 @@ struct rand_data {
+ * Helper functions
+ ***************************************************************************/
+
+-void jent_get_nstime(__u64 *out);
+-void *jent_zalloc(unsigned int len);
+-void jent_zfree(void *ptr);
+-int jent_fips_enabled(void);
+-void jent_panic(char *s);
+-void jent_memcpy(void *dest, const void *src, unsigned int n);
++#include "jitterentropy.h"
+
+ /**
+ * Update of the loop count used for the next round of
+--- /dev/null
++++ b/crypto/jitterentropy.h
+@@ -0,0 +1,19 @@
++// SPDX-License-Identifier: GPL-2.0-or-later
++
++typedef unsigned long long __u64;
++
++extern void *jent_zalloc(unsigned int len);
++extern void jent_zfree(void *ptr);
++extern int jent_fips_enabled(void);
++extern void jent_panic(char *s);
++extern void jent_memcpy(void *dest, const void *src, unsigned int n);
++extern void jent_get_nstime(__u64 *out);
++
++struct rand_data;
++extern int jent_entropy_init(void);
++extern int jent_read_entropy(struct rand_data *ec, unsigned char *data,
++ unsigned int len);
++
++extern struct rand_data *jent_entropy_collector_alloc(unsigned int osr,
++ unsigned int flags);
++extern void jent_entropy_collector_free(struct rand_data *entropy_collector);
diff --git a/PATCH/LRNG/696-04-v41-0001-Linux-Random-Number-Generator.patch b/PATCH/LRNG/696-04-v41-0001-Linux-Random-Number-Generator.patch
new file mode 100644
index 000000000..3fa0088a8
--- /dev/null
+++ b/PATCH/LRNG/696-04-v41-0001-Linux-Random-Number-Generator.patch
@@ -0,0 +1,4162 @@
+From 13918cf8b7d4f1383ff2a61687d45441b03b3b74 Mon Sep 17 00:00:00 2001
+From: Stephan Mueller
+Date: Wed, 23 Jun 2021 18:42:39 +0200
+Subject: [PATCH v41 01/13] Linux Random Number Generator
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+In an effort to provide a flexible implementation for a random number
+generator that also delivers entropy during early boot time, allows
+replacement of the deterministic random number generation mechanism,
+implement the various components in separate code for easier
+maintenance, and provide compliance to SP800-90[A|B|C], introduce
+the Linux Random Number Generator (LRNG) framework.
+
+The general design is as follows. Additional implementation details
+are given in [1]. The LRNG consists of the following components:
+
+1. The LRNG implements a DRNG. The DRNG always generates the
+requested amount of output. When using the SP800-90A terminology
+it operates without prediction resistance. The secondary DRNG
+maintains a counter of how many bytes were generated since last
+re-seed and a timer of the elapsed time since last re-seed. If either
+the counter or the timer reaches a threshold, the secondary DRNG is
+seeded from the entropy pool.
+
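As an illustration of the reseed trigger just described, here is a minimal C sketch of the counter-plus-timer check; the names drng_state, RESEED_OPS and RESEED_SECONDS are placeholders chosen for this sketch rather than the LRNG's own symbols, while the threshold values follow the figures given later in this commit message (2^20 generate operations or 600 seconds).

#include <stdbool.h>
#include <stdint.h>
#include <time.h>

struct drng_state {
	uint64_t generate_ops;  /* generate operations since the last reseed */
	time_t   last_reseed;   /* time of the last reseed */
};

/* Thresholds as described in this commit message. */
#define RESEED_OPS     (1ULL << 20)
#define RESEED_SECONDS 600

/* Returns true when the DRNG should be reseeded before serving the
 * next request for random numbers. */
static bool drng_needs_reseed(const struct drng_state *s, time_t now)
{
	return s->generate_ops >= RESEED_OPS ||
	       (now - s->last_reseed) >= RESEED_SECONDS;
}

The check is enforced lazily, on the next request for random numbers, rather than by an asynchronous timer.
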
+In case the Linux kernel detects a NUMA system, one secondary DRNG
+instance per NUMA node is maintained.
+
+2. The DRNG is seeded by concatenating the data from the
+following sources:
+
+(a) the output of the entropy pool,
+
+(b) the Jitter RNG if available and enabled, and
+
+(c) the CPU-based noise source such as Intel RDRAND if available and
+enabled.
+
+The entropy estimate of the data of all noise sources are added to
+form the entropy estimate of the data used to seed the DRNG with.
+The LRNG ensures, however, that the DRNG after seeding is at
+maximum the security strength of the DRNG.
+
+The LRNG is designed such that none of these noise sources can dominate
+the other noise sources to provide seed data to the DRNG due to
+the following:
+
+(a) During boot time, the amount of received interrupts are the trigger
+points to (re)seed the DRNG.
+
+(b) At runtime, the available entropy from the slow noise source is
+concatenated with a pre-defined amount of data from the fast noise
+sources. In addition, each DRNG reseed operation triggers external
+noise source providers to deliver one block of data.
+
+3. The entropy pool accumulates entropy obtained from certain events,
+which will henceforth be collectively called "slow noise sources".
+The entropy pool collects noise data from slow noise sources. Any data
+received by the LRNG from the slow noise sources is inserted into a
+per-CPU entropy pool using a hash operation that can be changed during
+runtime. By default, SHA-256 is used.
+
+ (a) When an interrupt occurs, the high-resolution time stamp is mixed
+into the per-CPU entropy pool. This time stamp is credited with
+heuristically implied entropy.
+
+ (b) HID event data like the key stroke or the mouse coordinates are
+mixed into the per-CPU entropy pool. This data is not credited with
+entropy by the LRNG.
+
+ (c) Device drivers may provide data that is mixed into an auxiliary
+pool using the same hash that is used to process the per-CPU entropy
+pool. This data is not credited with entropy by the LRNG.
+
+Any data provided from user space by either writing to /dev/random,
+/dev/urandom or the IOCTL of RNDADDENTROPY on both device files
+are always injected into the auxiliary pool.
+
+In addition, when a hardware random number generator covered by the
+Linux kernel HW generator framework wants to deliver random numbers,
+it is injected into the auxiliary pool as well. HW generator noise source
+is handled separately from the other noise source due to the fact that
+the HW generator framework may decide by itself when to deliver data
+whereas the other noise sources are always requested for data driven by the
+LRNG operation. Similarly any user space provided data is inserted into
+the entropy pool.
+
+When seed data for the DRNG is to be generated, all per-CPU
+entropy pools and the auxiliary pool are hashed. The message digest
+forms the new auxiliary pool state. At the same time, this data
+is used for seeding the DRNG.
+
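To make that seeding step concrete, the C sketch below hashes all per-CPU pools together with the auxiliary pool into one digest that both becomes the new auxiliary pool state and seeds the DRNG; the hash_* helpers, pool layout and pool count are invented for this sketch and do not correspond to the kernel's crypto API or the LRNG's internal structures.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define DIGEST_LEN 32          /* SHA-256, the default conditioning hash */
#define NR_POOLS   4           /* illustrative number of per-CPU pools */

/* Placeholder incremental hash interface (not the kernel's shash API). */
struct hash_ctx;
void hash_init(struct hash_ctx *c);
void hash_update(struct hash_ctx *c, const uint8_t *data, size_t len);
void hash_final(struct hash_ctx *c, uint8_t *digest);

static uint8_t percpu_pool[NR_POOLS][DIGEST_LEN];
static uint8_t aux_pool[DIGEST_LEN];

/* Digest all per-CPU pools and the auxiliary pool; the digest is the
 * new auxiliary pool state and, at the same time, the DRNG seed. */
static void generate_seed(struct hash_ctx *c, uint8_t seed[DIGEST_LEN])
{
	int i;

	hash_init(c);
	for (i = 0; i < NR_POOLS; i++)
		hash_update(c, percpu_pool[i], DIGEST_LEN);
	hash_update(c, aux_pool, DIGEST_LEN);
	hash_final(c, seed);
	memcpy(aux_pool, seed, DIGEST_LEN);
}
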
+To speed up the interrupt handling code of the LRNG, the time stamp
+collected for an interrupt event is truncated to the 8 least
+significant bits. 64 truncated time stamps are concatenated and then
+jointly inserted into the per-CPU entropy pool. During boot time,
+until the fully seeded stage is reached, each time stamp is truncated to its
+32 least significant bits and concatenated. When 16 such events
+are received, they are injected into the per-CPU entropy pool.
+
+The LRNG allows the DRNG mechanism to be changed at runtime. Per default,
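The batching of truncated interrupt time stamps can be pictured with the small C fragment below; the batch size of 64 follows the description above, while the type names and the pool_mix() helper are placeholders for this sketch only, not the LRNG's interrupt handler.

#include <stddef.h>
#include <stdint.h>

#define BATCH_ENTRIES 64            /* 64 truncated time stamps per batch */

struct irq_batch {
	uint8_t ts[BATCH_ENTRIES];  /* 8 least significant bits of each stamp */
	unsigned int fill;
};

/* Placeholder for the hash that compresses a full batch into the
 * per-CPU entropy pool. */
void pool_mix(const uint8_t *data, size_t len);

static void collect_irq_timestamp(struct irq_batch *b, uint64_t cycles)
{
	b->ts[b->fill++] = (uint8_t)cycles;     /* truncate to the 8 LSBs */
	if (b->fill == BATCH_ENTRIES) {
		pool_mix(b->ts, sizeof(b->ts)); /* continuous compression */
		b->fill = 0;
	}
}

During boot, the same idea applies with 32-bit stamps and batches of 16 events, as described above.
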
+a ChaCha20-based DRNG is used. The ChaCha20-DRNG implemented for the
+LRNG is also provided as a stand-alone user space deterministic random
+number generator. The LRNG also offers an SP800-90A DRBG based on the
+Linux kernel crypto API DRBG implementation.
+
+The processing of entropic data from the noise source before injecting
+them into the DRNG is performed with the following mathematical
+operations:
+
+1. Truncation: The received time stamps are truncated to 8 least
+significant bits (or 32 least significant bits during boot time)
+
+2. Concatenation: The received and truncated time stamps as well as
+auxiliary 32 bit words are concatenated to fill the per-CPU data
+array that is capable of holding 64 8-bit words.
+
+3. Hashing: A set of concatenated time stamp data received from the
+interrupts are hashed together with the current existing per-CPU
+entropy pool state. The resulting message digest is the new per-CPU
+entropy pool state.
+
+4. Hashing: When new data is added to the auxiliary pool, the data
+is hashed together with the auxiliary pool to form a new auxiliary
+pool state.
+
+5. Hashing: A message digest of all per-CPU entropy pools and the
+auxiliary pool is calculated which forms the new auxiliary pool
+state. At the same time, this message digest is used to fill the
+slow noise source output buffer discussed in the following.
+
+6. Truncation: The most-significant bits (MSB) defined by the
+requested number of bits (commonly equal to the security strength
+of the DRBG) or the entropy available transported with the buffer
+(which is the minimum of the message digest size and the available
+entropy in all entropy pools and the auxiliary pool), whatever is
+smaller, are obtained from the slow noise source output buffer.
+
+7. Concatenation: The temporary seed buffer used to seed the DRNG
+is a concatenation of the slow noise source buffer, the Jitter RNG
+output, the CPU noise source output, and the current time.
+
+The DRNG always tries to seed itself with 256 bits of entropy, except
+during boot. In any case, if the noise sources cannot deliver that
+amount, the available entropy is used and the DRNG keeps track on how
+much entropy it was seeded with. The entropy implied by the LRNG
+available in the entropy pool may be too conservative. To ensure
+that during boot time all available entropy from the entropy pool is
+transferred to the DRNG, the hash_df function always generates 256
+data bits during boot to seed the DRNG. During boot, the DRNG is
+seeded as follows:
+
+1. The DRNG is reseeded from the entropy pool and potentially the fast
+noise sources if the entropy pool has collected at least 32 bits of
+entropy from the interrupt noise source. The goal of this step is to
+ensure that the DRNG receives some initial entropy as early as
+possible. In addition it receives the entropy available from
+the fast noise sources.
+
+2. The DRNG is reseeded from the entropy pool and potentially the fast
+noise sources if all noise sources collectively can provide at least
+128 bits of entropy.
+
+3. The DRNG is reseeded from the entropy pool and potentially the fast
+noise sources if all noise sources collectively can provide at least 256
+bits.
+
+At the time of the reseeding steps, the DRNG requests as much entropy as
+is available in order to skip certain steps and reach the seeding level
+of 256 bits. This may imply that one or more of the aforementioned steps
+are skipped.
+
+In all listed steps, the DRNG is (re)seeded with a number of random
+bytes from the entropy pool that is at most the amount of entropy
+present in the entropy pool. This means that when the entropy pool
+contains 128 or 256 bits of entropy, the DRNG is seeded with that
+amount of entropy as well.
+
+Before the DRNG is seeded with 256 bits of entropy in step 3,
+requests of random data from /dev/random and the getrandom system
+call are not processed.
+
+The hash operation providing random data from the entropy pools will
+always require that all entropy sources collectively can deliver at
+least 128 entropy bits.
+
+The DRNG operates as deterministic random number generator with the
+following properties:
+
+* The maximum number of random bytes that can be generated with one
+DRNG generate operation is limited to 4096 bytes. When longer random
+numbers are requested, multiple DRNG generate operations are performed.
+The ChaCha20 DRNG as well as the SP800-90A DRBGs implement an update of
+their state after completing a generate request for backtracking
+resistance.
+
+* The secondary DRNG is reseeded with whatever entropy is available –
+in the worst case where no additional entropy can be provided by the
+noise sources, the DRNG is not re-seeded and continues its operation
+to try to reseed again after again the expiry of one of these thresholds:
+
+ - If the last reseeding of the secondary DRNG is more than 600 seconds
+ ago, or
+
+ - 2^20 DRNG generate operations are performed, whatever comes first, or
+
+ - the secondary DRNG is forced to reseed before the next generation of
+ random numbers if data has been injected into the LRNG by writing data
+ into /dev/random or /dev/urandom.
+
+The chosen values prevent high-volume requests from user space to cause
+frequent reseeding operations which drag down the performance of the
+DRNG.
+
+With the automatic reseeding after 600 seconds, the LRNG is triggered
+to reseed itself before the first request after a suspend that put the
+hardware to sleep for longer than 600 seconds.
+
+To support smaller devices including IoT environments, this patch
+allows reducing the runtime memory footprint of the LRNG at compile
+time by selecting smaller collection data sizes.
+
+When selecting the compilation of a kernel for a small environment,
+prevent the allocation of a buffer up to 4096 bytes to serve user space
+requests. In this case, the stack variable of 64 bytes is used to serve
+all user space requests.
+
+The LRNG has the following properties:
+
+* internal noise source: interrupts timing with fast boot time seeding
+
+* high performance of interrupt handling code: The LRNG impact on the
+interrupt handling has been reduced to a minimum. On one example
+system, the LRNG interrupt handling code in its fastest configuration
+executes within an average 55 cycles whereas the existing
+/dev/random on the same device takes about 97 cycles when measuring
+the execution time of add_interrupt_randomness().
+
+* use of almost never contended lock for hashing operation to collect
+ raw entropy supporting concurrency-free use of massive parallel
+ systems - worst case rate of contention is the number of DRNG
+ reseeds, usually: number of NUMA nodes contentions per 5 minutes.
+
+* use of standalone ChaCha20 based RNG with the option to use a
+ different DRNG selectable at compile time
+
+* instantiate one DRNG per NUMA node
+
+* support for runtime switchable output DRNGs
+
+* use of runtime-switchable hash for conditioning implementation
+following widely accepted approach
+
+* compile-time selectable collection size
+
+* support of small systems by allowing the reduction of the
+runtime memory needs
+
+Further details including the rationale for the design choices and
+properties of the LRNG together with testing is provided at [1].
+In addition, the documentation explains the conducted regression
+tests to verify that the LRNG is API and ABI compatible with the
+existing /dev/random implementation.
+
+[1] https://www.chronox.de/lrng.html
+
+CC: Torsten Duwe
+CC: "Eric W. Biederman"
+CC: "Alexander E. Patrakov"
+CC: "Ahmed S. Darwish"
+CC: "Theodore Y. Ts'o"
+CC: Willy Tarreau
+CC: Matthew Garrett
+CC: Vito Caputo
+CC: Andreas Dilger
+CC: Jan Kara
+CC: Ray Strode
+CC: William Jon McCann
+CC: zhangjs
+CC: Andy Lutomirski
+CC: Florian Weimer
+CC: Lennart Poettering
+CC: Nicolai Stange
+CC: Alexander Lobakin
+Mathematical aspects Reviewed-by: "Peter, Matthias"
+Reviewed-by: Marcelo Henrique Cerri
+Reviewed-by: Roman Drahtmueller
+Tested-by: Marcelo Henrique Cerri
+Tested-by: Neil Horman
+Signed-off-by: Stephan Mueller
+---
+ MAINTAINERS | 7 +
+ drivers/char/Kconfig | 2 +
+ drivers/char/Makefile | 9 +-
+ drivers/char/lrng/Kconfig | 205 ++++++++
+ drivers/char/lrng/Makefile | 9 +
+ drivers/char/lrng/lrng_archrandom.c | 91 ++++
+ drivers/char/lrng/lrng_aux.c | 136 ++++++
+ drivers/char/lrng/lrng_chacha20.c | 321 +++++++++++++
+ drivers/char/lrng/lrng_chacha20.h | 29 ++
+ drivers/char/lrng/lrng_drng.c | 422 +++++++++++++++++
+ drivers/char/lrng/lrng_interfaces.c | 650 ++++++++++++++++++++++++++
+ drivers/char/lrng/lrng_internal.h | 416 +++++++++++++++++
+ drivers/char/lrng/lrng_pool.c | 622 ++++++++++++++++++++++++
+ drivers/char/lrng/lrng_sw_noise.c | 702 ++++++++++++++++++++++++++++
+ drivers/char/lrng/lrng_sw_noise.h | 71 +++
+ include/linux/lrng.h | 81 ++++
+ 16 files changed, 3772 insertions(+), 1 deletion(-)
+ create mode 100644 drivers/char/lrng/Kconfig
+ create mode 100644 drivers/char/lrng/Makefile
+ create mode 100644 drivers/char/lrng/lrng_archrandom.c
+ create mode 100644 drivers/char/lrng/lrng_aux.c
+ create mode 100644 drivers/char/lrng/lrng_chacha20.c
+ create mode 100644 drivers/char/lrng/lrng_chacha20.h
+ create mode 100644 drivers/char/lrng/lrng_drng.c
+ create mode 100644 drivers/char/lrng/lrng_interfaces.c
+ create mode 100644 drivers/char/lrng/lrng_internal.h
+ create mode 100644 drivers/char/lrng/lrng_pool.c
+ create mode 100644 drivers/char/lrng/lrng_sw_noise.c
+ create mode 100644 drivers/char/lrng/lrng_sw_noise.h
+ create mode 100644 include/linux/lrng.h
+
+--- a/MAINTAINERS
++++ b/MAINTAINERS
+@@ -9494,6 +9494,13 @@ F: Documentation/core-api/atomic_ops.rst
+ F: Documentation/core-api/refcount-vs-atomic.rst
+ F: Documentation/memory-barriers.txt
+
++LINUX RANDOM NUMBER GENERATOR (LRNG) DRIVER
++M: Stephan Mueller
++S: Maintained
++W: https://www.chronox.de/lrng.html
++F: drivers/char/lrng/*
++F: include/linux/lrng.h
++
+ LIS3LV02D ACCELEROMETER DRIVER
+ M: Eric Piel
+ S: Maintained
+--- a/drivers/char/Kconfig
++++ b/drivers/char/Kconfig
+@@ -535,6 +535,8 @@ config ADI
+ and SSM (Silicon Secured Memory). Intended consumers of this
+ driver include crash and makedumpfile.
+
++source "drivers/char/lrng/Kconfig"
++
+ endmenu
+
+ config RANDOM_TRUST_CPU
+--- a/drivers/char/Makefile
++++ b/drivers/char/Makefile
+@@ -3,7 +3,14 @@
+ # Makefile for the kernel character device drivers.
+ #
+
+-obj-y += mem.o random.o
++obj-y += mem.o
++
++ifeq ($(CONFIG_LRNG),y)
++ obj-y += lrng/
++else
++ obj-y += random.o
++endif
++
+ obj-$(CONFIG_TTY_PRINTK) += ttyprintk.o
+ obj-y += misc.o
+ obj-$(CONFIG_ATARI_DSP56K) += dsp56k.o
+--- /dev/null
++++ b/drivers/char/lrng/Kconfig
+@@ -0,0 +1,205 @@
++# SPDX-License-Identifier: GPL-2.0
++#
++# Linux Random Number Generator configuration
++#
++
++menuconfig LRNG
++ bool "Linux Random Number Generator"
++ select CRYPTO_LIB_SHA256 if CRYPTO
++ help
++ The Linux Random Number Generator (LRNG) is the replacement
++ of the existing /dev/random provided with drivers/char/random.c.
++ It generates entropy from different noise sources and
++ delivers significant entropy during boot.
++
++if LRNG
++
++menu "Specific DRNG seeding strategies"
++
++config LRNG_OVERSAMPLE_ENTROPY_SOURCES
++ bool "Oversample entropy sources"
++ default n
++ help
++ When enabling this option, the entropy sources are
++ over-sampled with the following approach: First, the
++ entropy sources are requested to provide 64 bits more
++ entropy than the size of the entropy buffer. For example,
++ if the entropy buffer is 256 bits, 320 bits of entropy
++ is requested to fill that buffer.
++
++ Second, the seed operation of the deterministic RNG
++ requests 128 bits more data from each entropy source than
++ the security strength of the DRNG during initialization.
++ A prerequisite for this operation is that the digest size
++ of the used hash must be at least equally large to generate
++ that buffer. If the prerequisite is not met, this
++ oversampling is not applied.
++
++ This strategy is intended to offset the asymptotic entropy
++ increase to reach full entropy in a buffer.
++
++ The strategy is consistent with the requirements in
++ NIST SP800-90C and is only enforced with fips=1.
++
++ If unsure, say N.
++
++config LRNG_OVERSAMPLE_ES_BITS
++ int
++ default 0 if !LRNG_OVERSAMPLE_ENTROPY_SOURCES
++ default 64 if LRNG_OVERSAMPLE_ENTROPY_SOURCES
++
++config LRNG_SEED_BUFFER_INIT_ADD_BITS
++ int
++ default 0 if !LRNG_OVERSAMPLE_ENTROPY_SOURCES
++ default 128 if LRNG_OVERSAMPLE_ENTROPY_SOURCES
++
++endmenu # "Specific DRNG seeding strategies"
++
++menu "Entropy Source Configuration"
++
++comment "Interrupt Entropy Source"
++
++choice
++ prompt "Continuous entropy compression boot time setting"
++ default LRNG_CONTINUOUS_COMPRESSION_ENABLED
++ help
++ Select the default behavior of the interrupt entropy source
++ continuous compression operation.
++
++ The Linux RNG collects entropy data during each interrupt.
++ For performance reasons, an amount of entropy data defined by
++ the LRNG entropy collection pool size is concatenated into
++ an array. When that array is filled up, a hash is calculated
++ to compress the entropy. That hash is calculated in
++ interrupt context.
++
++ In case such hash calculation in interrupt context is deemed
++ too time-consuming, the continuous compression operation
++ can be disabled. If disabled, the collection of entropy will
++ not trigger a hash compression operation in interrupt context.
++ The compression happens only when the DRNG is reseeded which is
++ in process context. This implies that old entropy data
++ collected after the last DRNG-reseed is overwritten with newer
++ entropy data once the collection pool is full instead of
++ retaining its entropy with the compression operation.
++
++ config LRNG_CONTINUOUS_COMPRESSION_ENABLED
++ bool "Enable continuous compression (default)"
++
++ config LRNG_CONTINUOUS_COMPRESSION_DISABLED
++ bool "Disable continuous compression"
++endchoice
++
++config LRNG_ENABLE_CONTINUOUS_COMPRESSION
++ bool
++ default y if LRNG_CONTINUOUS_COMPRESSION_ENABLED
++ default n if LRNG_CONTINUOUS_COMPRESSION_DISABLED
++
++config LRNG_SWITCHABLE_CONTINUOUS_COMPRESSION
++ bool "Runtime-switchable continuous entropy compression"
++ help
++ By default, the interrupt entropy source continuous
++ compression operation behavior is hard-wired into the kernel.
++ Enable this option to allow it to be configurable at boot time.
++
++ To modify the default behavior of the continuous
++ compression operation, use the kernel command line option
++ of lrng_sw_noise.lrng_pcpu_continuous_compression.
++
++ If unsure, say N.
++
++choice
++ prompt "LRNG Entropy Collection Pool Size"
++ default LRNG_COLLECTION_SIZE_1024
++ help
++ Select the size of the LRNG entropy collection pool
++ storing data for the interrupt entropy source without
++ performing a compression operation. The larger the
++ collection size is, the faster the average interrupt
++ handling will be. The collection size represents the
++ number of bytes of the per-CPU memory used to batch
++ up entropy event data.
++
++ The default value is good for regular operations. Choose
++ larger sizes for servers that have no memory limitations.
++ If runtime memory is precious, choose a smaller size.
++
++ The collection size is unrelated to the entropy rate
++ or the amount of entropy the LRNG can process.
++
++ config LRNG_COLLECTION_SIZE_32
++ depends on LRNG_CONTINUOUS_COMPRESSION_ENABLED
++ depends on !LRNG_SWITCHABLE_CONTINUOUS_COMPRESSION
++ depends on !LRNG_OVERSAMPLE_ENTROPY_SOURCES
++ bool "32 interrupt events"
++
++ config LRNG_COLLECTION_SIZE_256
++ depends on !LRNG_OVERSAMPLE_ENTROPY_SOURCES
++ bool "256 interrupt events"
++
++ config LRNG_COLLECTION_SIZE_512
++ bool "512 interrupt events"
++
++ config LRNG_COLLECTION_SIZE_1024
++ bool "1024 interrupt events (default)"
++
++ config LRNG_COLLECTION_SIZE_2048
++ bool "2048 interrupt events"
++
++ config LRNG_COLLECTION_SIZE_4096
++ bool "4096 interrupt events"
++
++ config LRNG_COLLECTION_SIZE_8192
++ bool "8192 interrupt events"
++
++endchoice
++
++config LRNG_COLLECTION_SIZE
++ int
++ default 32 if LRNG_COLLECTION_SIZE_32
++ default 256 if LRNG_COLLECTION_SIZE_256
++ default 512 if LRNG_COLLECTION_SIZE_512
++ default 1024 if LRNG_COLLECTION_SIZE_1024
++ default 2048 if LRNG_COLLECTION_SIZE_2048
++ default 4096 if LRNG_COLLECTION_SIZE_4096
++ default 8192 if LRNG_COLLECTION_SIZE_8192
++
++config LRNG_IRQ_ENTROPY_RATE
++ int "Interrupt Entropy Source Entropy Rate"
++ range 256 4294967295
++ default 256
++ help
++ The LRNG will collect the configured number of interrupts to
++ obtain 256 bits of entropy. This value can be set to any value between
++ 256 and 4294967295. The LRNG guarantees that this value is not
++ lower than 256. This lower limit implies that one interrupt event
++ is credited with one bit of entropy. This value is subject to the
++ is credited with one bit of entropy. This value is subject to an
++ increase by the oversampling factor if no high-resolution timer
++
++ In order to effectively disable the interrupt entropy source,
++ the option has to be set to 4294967295. In this case, the
++ interrupt entropy source will still deliver data but without
++ being credited with entropy.
++
++comment "CPU Entropy Source"
++
++config LRNG_CPU_ENTROPY_RATE
++ int "CPU Entropy Source Entropy Rate"
++ range 0 256
++ default 8
++ help
++ The option defines the amount of entropy the LRNG applies to 256
++ bits of data obtained from the CPU entropy source. The LRNG
++ enforces the limit that this value must be in the range between
++ 0 and 256.
++
++ In order to disable the CPU entropy source, the option has to
++ be set to 0.
++
++ Note, this option is overwritten when the option
++ CONFIG_RANDOM_TRUST_CPU is set.
++
++endmenu # "Entropy Source Configuration"
++
++endif # LRNG
+--- /dev/null
++++ b/drivers/char/lrng/Makefile
+@@ -0,0 +1,9 @@
++# SPDX-License-Identifier: GPL-2.0
++#
++# Makefile for the Linux Random Number Generator.
++#
++
++obj-y += lrng_pool.o lrng_aux.o \
++ lrng_sw_noise.o lrng_archrandom.o \
++ lrng_drng.o lrng_chacha20.o \
++ lrng_interfaces.o
+--- /dev/null
++++ b/drivers/char/lrng/lrng_archrandom.c
+@@ -0,0 +1,91 @@
++// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
++/*
++ * LRNG Fast Entropy Source: CPU-based entropy source
++ *
++ * Copyright (C) 2016 - 2021, Stephan Mueller
++ */
++
++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
++
++#include
++
++#include "lrng_internal.h"
++
++/*
++ * Estimated entropy of data is a 32th of LRNG_DRNG_SECURITY_STRENGTH_BITS.
++ * As we have no ability to review the implementation of those noise sources,
++ * it is prudent to have a conservative estimate here.
++ */
++#define LRNG_ARCHRANDOM_DEFAULT_STRENGTH CONFIG_LRNG_CPU_ENTROPY_RATE
++#define LRNG_ARCHRANDOM_TRUST_CPU_STRENGTH LRNG_DRNG_SECURITY_STRENGTH_BITS
++#ifdef CONFIG_RANDOM_TRUST_CPU
++static u32 archrandom = LRNG_ARCHRANDOM_TRUST_CPU_STRENGTH;
++#else
++static u32 archrandom = LRNG_ARCHRANDOM_DEFAULT_STRENGTH;
++#endif
++#ifdef CONFIG_LRNG_RUNTIME_ES_CONFIG
++module_param(archrandom, uint, 0644);
++MODULE_PARM_DESC(archrandom, "Entropy in bits of 256 data bits from CPU noise source (e.g. RDRAND)");
++#endif
++
++static int __init lrng_parse_trust_cpu(char *arg)
++{
++ int ret;
++ bool trust_cpu = false;
++
++ ret = kstrtobool(arg, &trust_cpu);
++ if (ret)
++ return ret;
++
++ if (trust_cpu) {
++ archrandom = LRNG_ARCHRANDOM_TRUST_CPU_STRENGTH;
++ lrng_pool_add_entropy();
++ } else {
++ archrandom = LRNG_ARCHRANDOM_DEFAULT_STRENGTH;
++ }
++
++ return 0;
++}
++early_param("random.trust_cpu", lrng_parse_trust_cpu);
++
++u32 lrng_archrandom_entropylevel(u32 requested_bits)
++{
++ return lrng_fast_noise_entropylevel(archrandom, requested_bits);
++}
++
++/**
++ * lrng_get_arch() - Get CPU noise source entropy
++ *
++ * @outbuf: buffer to store entropy of size LRNG_DRNG_SECURITY_STRENGTH_BYTES
++ *
++ * Return:
++ * * > 0 on success where value provides the added entropy in bits
++ * * 0 if no fast source was available
++ */
++u32 lrng_get_arch(u8 *outbuf, u32 requested_bits)
++{
++ u32 i, ent_bits = lrng_archrandom_entropylevel(requested_bits);
++
++ /* operate on full blocks */
++ BUILD_BUG_ON(LRNG_DRNG_SECURITY_STRENGTH_BYTES % sizeof(unsigned long));
++ BUILD_BUG_ON(CONFIG_LRNG_SEED_BUFFER_INIT_ADD_BITS %
++ sizeof(unsigned long));
++ /* ensure we have aligned buffers */
++ BUILD_BUG_ON(LRNG_KCAPI_ALIGN % sizeof(unsigned long));
++
++ if (!ent_bits)
++ return 0;
++
++ for (i = 0; i < (requested_bits >> 3);
++ i += sizeof(unsigned long)) {
++ if (!arch_get_random_seed_long((unsigned long *)(outbuf + i)) &&
++ !arch_get_random_long((unsigned long *)(outbuf + i))) {
++ archrandom = 0;
++ return 0;
++ }
++ }
++
++ pr_debug("obtained %u bits of entropy from CPU RNG noise source\n",
++ ent_bits);
++ return ent_bits;
++}
+--- /dev/null
++++ b/drivers/char/lrng/lrng_aux.c
+@@ -0,0 +1,136 @@
++// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
++/*
++ * LRNG auxiliary interfaces
++ *
++ * Copyright (C) 2019 - 2021 Stephan Mueller
++ * Copyright (C) 2017 Jason A. Donenfeld . All
++ * Rights Reserved.
++ * Copyright (C) 2016 Jason Cooper
++ */
++
++#include
++#include
++
++#include "lrng_internal.h"
++
++struct batched_entropy {
++ union {
++ u64 entropy_u64[LRNG_DRNG_BLOCKSIZE / sizeof(u64)];
++ u32 entropy_u32[LRNG_DRNG_BLOCKSIZE / sizeof(u32)];
++ };
++ unsigned int position;
++ spinlock_t batch_lock;
++};
++
++/*
++ * Get a random word for internal kernel use only. The quality of the random
++ * number is as good as /dev/urandom, but there is no backtrack protection,
++ * with the goal of being quite fast and not depleting entropy.
++ */
++static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u64) = {
++ .batch_lock = __SPIN_LOCK_UNLOCKED(batched_entropy_u64.lock),
++};
++
++u64 get_random_u64(void)
++{
++ u64 ret;
++ unsigned long flags;
++ struct batched_entropy *batch;
++
++ lrng_debug_report_seedlevel("get_random_u64");
++
++ batch = raw_cpu_ptr(&batched_entropy_u64);
++ spin_lock_irqsave(&batch->batch_lock, flags);
++ if (batch->position % ARRAY_SIZE(batch->entropy_u64) == 0) {
++ lrng_drng_get_atomic((u8 *)batch->entropy_u64,
++ LRNG_DRNG_BLOCKSIZE);
++ batch->position = 0;
++ }
++ ret = batch->entropy_u64[batch->position++];
++ spin_unlock_irqrestore(&batch->batch_lock, flags);
++ return ret;
++}
++EXPORT_SYMBOL(get_random_u64);
++
++static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u32) = {
++ .batch_lock = __SPIN_LOCK_UNLOCKED(batched_entropy_u32.lock),
++};
++
++u32 get_random_u32(void)
++{
++ u32 ret;
++ unsigned long flags;
++ struct batched_entropy *batch;
++
++ lrng_debug_report_seedlevel("get_random_u32");
++
++ batch = raw_cpu_ptr(&batched_entropy_u32);
++ spin_lock_irqsave(&batch->batch_lock, flags);
++ if (batch->position % ARRAY_SIZE(batch->entropy_u32) == 0) {
++ lrng_drng_get_atomic((u8 *)batch->entropy_u32,
++ LRNG_DRNG_BLOCKSIZE);
++ batch->position = 0;
++ }
++ ret = batch->entropy_u32[batch->position++];
++ spin_unlock_irqrestore(&batch->batch_lock, flags);
++ return ret;
++}
++EXPORT_SYMBOL(get_random_u32);
++
++/*
++ * It's important to invalidate all potential batched entropy that might
++ * be stored before the crng is initialized, which we can do lazily by
++ * simply resetting the counter to zero so that it's re-extracted on the
++ * next usage.
++ */
++void invalidate_batched_entropy(void)
++{
++ int cpu;
++ unsigned long flags;
++
++ for_each_possible_cpu(cpu) {
++ struct batched_entropy *batched_entropy;
++
++ batched_entropy = per_cpu_ptr(&batched_entropy_u32, cpu);
++ spin_lock_irqsave(&batched_entropy->batch_lock, flags);
++ batched_entropy->position = 0;
++ spin_unlock(&batched_entropy->batch_lock);
++
++ batched_entropy = per_cpu_ptr(&batched_entropy_u64, cpu);
++ spin_lock(&batched_entropy->batch_lock);
++ batched_entropy->position = 0;
++ spin_unlock_irqrestore(&batched_entropy->batch_lock, flags);
++ }
++}
++
++/**
++ * randomize_page - Generate a random, page aligned address
++ * @start: The smallest acceptable address the caller will take.
++ * @range: The size of the area, starting at @start, within which the
++ * random address must fall.
++ *
++ * If @start + @range would overflow, @range is capped.
++ *
++ * NOTE: Historical use of randomize_range, which this replaces, presumed that
++ * @start was already page aligned. We now align it regardless.
++ *
++ * Return: A page aligned address within [start, start + range). On error,
++ * @start is returned.
++ */
++unsigned long randomize_page(unsigned long start, unsigned long range)
++{
++ if (!PAGE_ALIGNED(start)) {
++ range -= PAGE_ALIGN(start) - start;
++ start = PAGE_ALIGN(start);
++ }
++
++ if (start > ULONG_MAX - range)
++ range = ULONG_MAX - start;
++
++ range >>= PAGE_SHIFT;
++
++ if (range == 0)
++ return start;
++
++ return start + (get_random_long() % range << PAGE_SHIFT);
++}
+--- /dev/null
++++ b/drivers/char/lrng/lrng_chacha20.c
+@@ -0,0 +1,321 @@
++// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
++/*
++ * Backend for the LRNG providing the cryptographic primitives using
++ * ChaCha20 cipher implementations.
++ *
++ * Copyright (C) 2016 - 2021, Stephan Mueller
++ */
++
++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
++
++#include
++#include
++#include
++#include
++
++#include "lrng_chacha20.h"
++#include "lrng_internal.h"
++
++/******************************* ChaCha20 DRNG *******************************/
++
++#define CHACHA_BLOCK_WORDS (CHACHA_BLOCK_SIZE / sizeof(u32))
++
++struct chacha20_state {
++ struct chacha20_block block;
++};
++
++/*
++ * Have a static memory block for the ChaCha20 DRNG instance to avoid calling
++ * kmalloc too early in the boot cycle. For subsequent allocation requests,
++ * such as per-NUMA-node DRNG instances, kmalloc will be used.
++ */
++struct chacha20_state chacha20 __latent_entropy;
++
++/**
++ * Update of the ChaCha20 state by either using an unused buffer part or by
++ * generating one ChaCha20 block which is half of the state of the ChaCha20.
++ * The block is XORed into the key part of the state. This shall ensure
++ * backtracking resistance as well as a proper mix of the ChaCha20 state once
++ * the key is injected.
++ */
++static void lrng_chacha20_update(struct chacha20_state *chacha20_state,
++ __le32 *buf, u32 used_words)
++{
++ struct chacha20_block *chacha20 = &chacha20_state->block;
++ u32 i;
++ __le32 tmp[CHACHA_BLOCK_WORDS];
++
++ BUILD_BUG_ON(sizeof(struct chacha20_block) != CHACHA_BLOCK_SIZE);
++ BUILD_BUG_ON(CHACHA_BLOCK_SIZE != 2 * CHACHA_KEY_SIZE);
++
++ if (used_words > CHACHA_KEY_SIZE_WORDS) {
++ chacha20_block(&chacha20->constants[0], (u8 *)tmp);
++ for (i = 0; i < CHACHA_KEY_SIZE_WORDS; i++)
++ chacha20->key.u[i] ^= le32_to_cpu(tmp[i]);
++ memzero_explicit(tmp, sizeof(tmp));
++ } else {
++ for (i = 0; i < CHACHA_KEY_SIZE_WORDS; i++)
++ chacha20->key.u[i] ^= le32_to_cpu(buf[i + used_words]);
++ }
++
++ /* Deterministic increment of nonce as required in RFC 7539 chapter 4 */
++ chacha20->nonce[0]++;
++ if (chacha20->nonce[0] == 0) {
++ chacha20->nonce[1]++;
++ if (chacha20->nonce[1] == 0)
++ chacha20->nonce[2]++;
++ }
++
++ /* Leave the counter untouched as its start value is undefined in the RFC */
++}
++
++/*
++ * Seed the ChaCha20 DRNG by injecting the input data into the key part of
++ * the ChaCha20 state. If the input data is longer than the ChaCha20 key size,
++ * perform a ChaCha20 operation after processing of key size input data.
++ * This operation shall spread out the entropy into the ChaCha20 state before
++ * new entropy is injected into the key part.
++ */
++static int lrng_cc20_drng_seed_helper(void *drng, const u8 *inbuf, u32 inbuflen)
++{
++ struct chacha20_state *chacha20_state = (struct chacha20_state *)drng;
++ struct chacha20_block *chacha20 = &chacha20_state->block;
++
++ while (inbuflen) {
++ u32 i, todo = min_t(u32, inbuflen, CHACHA_KEY_SIZE);
++
++ for (i = 0; i < todo; i++)
++ chacha20->key.b[i] ^= inbuf[i];
++
++ /* Break potential dependencies between the inbuf key blocks */
++ lrng_chacha20_update(chacha20_state, NULL,
++ CHACHA_BLOCK_WORDS);
++ inbuf += todo;
++ inbuflen -= todo;
++ }
++
++ return 0;
++}
++
++/*
++ * Chacha20 DRNG generation of random numbers: the stream output of ChaCha20
++ * is the random number. After the completion of the generation of the
++ * stream, the entire ChaCha20 state is updated.
++ *
++ * Note, as the ChaCha20 implements a 32 bit counter, we must ensure
++ * that this function is only invoked for at most 2^32 - 1 ChaCha20 blocks
++ * before a reseed or an update happens. This is ensured by the variable
++ * outbuflen which is a 32 bit integer defining the number of bytes to be
++ * generated by the ChaCha20 DRNG. At the end of this function, an update
++ * operation is invoked which implies that the 32 bit counter will never be
++ * overflown in this implementation.
++ */
++static int lrng_cc20_drng_generate_helper(void *drng, u8 *outbuf, u32 outbuflen)
++{
++ struct chacha20_state *chacha20_state = (struct chacha20_state *)drng;
++ struct chacha20_block *chacha20 = &chacha20_state->block;
++ __le32 aligned_buf[CHACHA_BLOCK_WORDS];
++ u32 ret = outbuflen, used = CHACHA_BLOCK_WORDS;
++ int zeroize_buf = 0;
++
++ while (outbuflen >= CHACHA_BLOCK_SIZE) {
++ chacha20_block(&chacha20->constants[0], outbuf);
++ outbuf += CHACHA_BLOCK_SIZE;
++ outbuflen -= CHACHA_BLOCK_SIZE;
++ }
++
++ if (outbuflen) {
++ chacha20_block(&chacha20->constants[0], (u8 *)aligned_buf);
++ memcpy(outbuf, aligned_buf, outbuflen);
++ used = ((outbuflen + sizeof(aligned_buf[0]) - 1) /
++ sizeof(aligned_buf[0]));
++ zeroize_buf = 1;
++ }
++
++ lrng_chacha20_update(chacha20_state, aligned_buf, used);
++
++ if (zeroize_buf)
++ memzero_explicit(aligned_buf, sizeof(aligned_buf));
++
++ return ret;
++}
++
++void lrng_cc20_init_state(struct chacha20_state *state)
++{
++ lrng_cc20_init_rfc7539(&state->block);
++}
++
++/*
++ * Allocation of the DRNG state
++ */
++static void *lrng_cc20_drng_alloc(u32 sec_strength)
++{
++ struct chacha20_state *state = NULL;
++
++ if (sec_strength > CHACHA_KEY_SIZE) {
++ pr_err("Security strength of ChaCha20 DRNG (%u bits) lower than requested by LRNG (%u bits)\n",
++ CHACHA_KEY_SIZE * 8, sec_strength * 8);
++ return ERR_PTR(-EINVAL);
++ }
++ if (sec_strength < CHACHA_KEY_SIZE)
++ pr_warn("Security strength of ChaCha20 DRNG (%u bits) higher than requested by LRNG (%u bits)\n",
++ CHACHA_KEY_SIZE * 8, sec_strength * 8);
++
++ state = kmalloc(sizeof(struct chacha20_state), GFP_KERNEL);
++ if (!state)
++ return ERR_PTR(-ENOMEM);
++ pr_debug("memory for ChaCha20 core allocated\n");
++
++ lrng_cc20_init_state(state);
++
++ return state;
++}
++
++static void lrng_cc20_drng_dealloc(void *drng)
++{
++ struct chacha20_state *chacha20_state = (struct chacha20_state *)drng;
++
++ if (drng == &chacha20) {
++ memzero_explicit(chacha20_state, sizeof(*chacha20_state));
++ pr_debug("static ChaCha20 core zeroized\n");
++ return;
++ }
++
++ pr_debug("ChaCha20 core zeroized and freed\n");
++ kfree_sensitive(chacha20_state);
++}
++
++/******************************* Hash Operation *******************************/
++
++#ifdef CONFIG_CRYPTO_LIB_SHA256
++
++#include
++
++static u32 lrng_cc20_hash_digestsize(void *hash)
++{
++ return SHA256_DIGEST_SIZE;
++}
++
++static int lrng_cc20_hash_init(struct shash_desc *shash, void *hash)
++{
++ /*
++ * We do not need a TFM - we only need sufficient space for
++ * struct sha256_state on the stack.
++ */
++ sha256_init(shash_desc_ctx(shash));
++ return 0;
++}
++
++static int lrng_cc20_hash_update(struct shash_desc *shash,
++ const u8 *inbuf, u32 inbuflen)
++{
++ sha256_update(shash_desc_ctx(shash), inbuf, inbuflen);
++ return 0;
++}
++
++static int lrng_cc20_hash_final(struct shash_desc *shash, u8 *digest)
++{
++ sha256_final(shash_desc_ctx(shash), digest);
++ return 0;
++}
++
++static const char *lrng_cc20_hash_name(void)
++{
++ return "SHA-256";
++}
++
++static void lrng_cc20_hash_desc_zero(struct shash_desc *shash)
++{
++ memzero_explicit(shash_desc_ctx(shash), sizeof(struct sha256_state));
++}
++
++#else /* CONFIG_CRYPTO_LIB_SHA256 */
++
++#include
++#include
++
++/*
++ * If the SHA-256 support is not compiled, we fall back to SHA-1 that is always
++ * compiled and present in the kernel.
++ */
++static u32 lrng_cc20_hash_digestsize(void *hash)
++{
++ return SHA1_DIGEST_SIZE;
++}
++
++static void lrng_sha1_block_fn(struct sha1_state *sctx, const u8 *src,
++ int blocks)
++{
++ u32 temp[SHA1_WORKSPACE_WORDS];
++
++ while (blocks--) {
++ sha1_transform(sctx->state, src, temp);
++ src += SHA1_BLOCK_SIZE;
++ }
++ memzero_explicit(temp, sizeof(temp));
++}
++
++static int lrng_cc20_hash_init(struct shash_desc *shash, void *hash)
++{
++ /*
++ * We do not need a TFM - we only need sufficient space for
++ * struct sha1_state on the stack.
++ */
++ sha1_base_init(shash);
++ return 0;
++}
++
++static int lrng_cc20_hash_update(struct shash_desc *shash,
++ const u8 *inbuf, u32 inbuflen)
++{
++ return sha1_base_do_update(shash, inbuf, inbuflen, lrng_sha1_block_fn);
++}
++
++static int lrng_cc20_hash_final(struct shash_desc *shash, u8 *digest)
++{
++ return sha1_base_do_finalize(shash, lrng_sha1_block_fn) ?:
++ sha1_base_finish(shash, digest);
++}
++
++static const char *lrng_cc20_hash_name(void)
++{
++ return "SHA-1";
++}
++
++static void lrng_cc20_hash_desc_zero(struct shash_desc *shash)
++{
++ memzero_explicit(shash_desc_ctx(shash), sizeof(struct sha1_state));
++}
++
++#endif /* CONFIG_CRYPTO_LIB_SHA256 */
++
++static void *lrng_cc20_hash_alloc(void)
++{
++ pr_info("Hash %s allocated\n", lrng_cc20_hash_name());
++ return NULL;
++}
++
++static void lrng_cc20_hash_dealloc(void *hash)
++{
++}
++
++static const char *lrng_cc20_drng_name(void)
++{
++ return "ChaCha20 DRNG";
++}
++
++const struct lrng_crypto_cb lrng_cc20_crypto_cb = {
++ .lrng_drng_name = lrng_cc20_drng_name,
++ .lrng_hash_name = lrng_cc20_hash_name,
++ .lrng_drng_alloc = lrng_cc20_drng_alloc,
++ .lrng_drng_dealloc = lrng_cc20_drng_dealloc,
++ .lrng_drng_seed_helper = lrng_cc20_drng_seed_helper,
++ .lrng_drng_generate_helper = lrng_cc20_drng_generate_helper,
++ .lrng_hash_alloc = lrng_cc20_hash_alloc,
++ .lrng_hash_dealloc = lrng_cc20_hash_dealloc,
++ .lrng_hash_digestsize = lrng_cc20_hash_digestsize,
++ .lrng_hash_init = lrng_cc20_hash_init,
++ .lrng_hash_update = lrng_cc20_hash_update,
++ .lrng_hash_final = lrng_cc20_hash_final,
++ .lrng_hash_desc_zero = lrng_cc20_hash_desc_zero,
++};
+--- /dev/null
++++ b/drivers/char/lrng/lrng_chacha20.h
+@@ -0,0 +1,29 @@
++/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
++/*
++ * LRNG ChaCha20 definitions
++ *
++ * Copyright (C) 2016 - 2021, Stephan Mueller
++ */
++
++#include
++
++/* State according to RFC 7539 section 2.3 */
++struct chacha20_block {
++ u32 constants[4];
++ union {
++#define CHACHA_KEY_SIZE_WORDS (CHACHA_KEY_SIZE / sizeof(u32))
++ u32 u[CHACHA_KEY_SIZE_WORDS];
++ u8 b[CHACHA_KEY_SIZE];
++ } key;
++ u32 counter;
++ u32 nonce[3];
++};
++
++static inline void lrng_cc20_init_rfc7539(struct chacha20_block *chacha20)
++{
++ /* String "expand 32-byte k" */
++ chacha20->constants[0] = 0x61707865;
++ chacha20->constants[1] = 0x3320646e;
++ chacha20->constants[2] = 0x79622d32;
++ chacha20->constants[3] = 0x6b206574;
++}
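
The four constants written above are just the ASCII string "expand 32-byte k"
from RFC 7539 read as four little-endian 32-bit words. As a quick check, the
following standalone user-space sketch (not part of the patch, and assuming a
little-endian host) reproduces the values:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* 16 characters; the array size drops the terminating NUL */
	const char sigma[16] = "expand 32-byte k";
	uint32_t w[4];
	int i;

	memcpy(w, sigma, sizeof(w));	/* reinterpret as little-endian words */
	for (i = 0; i < 4; i++)		/* 61707865 3320646e 79622d32 6b206574 */
		printf("0x%08x\n", (unsigned int)w[i]);
	return 0;
}
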
+--- /dev/null
++++ b/drivers/char/lrng/lrng_drng.c
+@@ -0,0 +1,422 @@
++// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
++/*
++ * LRNG DRNG processing
++ *
++ * Copyright (C) 2016 - 2021, Stephan Mueller
++ */
++
++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
++
++#include <linux/fips.h>
++#include <linux/lrng.h>
++
++#include "lrng_internal.h"
++
++/*
++ * Maximum number of seconds between two reseeds of the DRNG. Note, this limit
++ * is enforced with the next request for random numbers from the
++ * DRNG. Setting this value to zero implies a reseeding attempt before every
++ * generated random number.
++ */
++int lrng_drng_reseed_max_time = 600;
++
++static atomic_t lrng_avail = ATOMIC_INIT(0);
++
++DEFINE_MUTEX(lrng_crypto_cb_update);
++
++/* DRNG for /dev/urandom, getrandom(2), get_random_bytes */
++static struct lrng_drng lrng_drng_init = {
++ .drng = &chacha20,
++ .crypto_cb = &lrng_cc20_crypto_cb,
++ .lock = __MUTEX_INITIALIZER(lrng_drng_init.lock),
++ .spin_lock = __SPIN_LOCK_UNLOCKED(lrng_drng_init.spin_lock),
++ .hash_lock = __RW_LOCK_UNLOCKED(lrng_drng_init.hash_lock)
++};
++
++/*
++ * DRNG for get_random_bytes when called in atomic context. This
++ * DRNG will always use the ChaCha20 DRNG. It will never benefit from a
++ * DRNG switch like the "regular" DRNG. If there was no DRNG switch, the atomic
++ * DRNG is identical to the "regular" DRNG.
++ *
++ * The reason for having this is that DRNGs other than the ChaCha20 DRNG may
++ * sleep, which is not permitted in atomic context.
++ */
++static struct lrng_drng lrng_drng_atomic = {
++ .drng = &chacha20,
++ .crypto_cb = &lrng_cc20_crypto_cb,
++ .spin_lock = __SPIN_LOCK_UNLOCKED(lrng_drng_atomic.spin_lock),
++ .hash_lock = __RW_LOCK_UNLOCKED(lrng_drng_atomic.hash_lock)
++};
++
++/********************************** Helper ************************************/
++
++bool lrng_get_available(void)
++{
++ return likely(atomic_read(&lrng_avail));
++}
++
++void lrng_set_available(void)
++{
++ atomic_set(&lrng_avail, 1);
++}
++
++struct lrng_drng *lrng_drng_init_instance(void)
++{
++ return &lrng_drng_init;
++}
++
++struct lrng_drng *lrng_drng_atomic_instance(void)
++{
++ return &lrng_drng_atomic;
++}
++
++void lrng_drng_reset(struct lrng_drng *drng)
++{
++ atomic_set(&drng->requests, LRNG_DRNG_RESEED_THRESH);
++ drng->last_seeded = jiffies;
++ drng->fully_seeded = false;
++ drng->force_reseed = true;
++ pr_debug("reset DRNG\n");
++}
++
++/* Initialize the default DRNG during boot */
++static void lrng_drng_seed(struct lrng_drng *drng);
++void lrng_drngs_init_cc20(bool force_seed)
++{
++ unsigned long flags = 0;
++
++ if (lrng_get_available())
++ return;
++
++ lrng_drng_lock(&lrng_drng_init, &flags);
++ if (lrng_get_available()) {
++ lrng_drng_unlock(&lrng_drng_init, &flags);
++ if (force_seed)
++ goto seed;
++ return;
++ }
++
++ lrng_drng_reset(&lrng_drng_init);
++ lrng_cc20_init_state(&chacha20);
++ lrng_drng_unlock(&lrng_drng_init, &flags);
++
++ lrng_drng_lock(&lrng_drng_atomic, &flags);
++ lrng_drng_reset(&lrng_drng_atomic);
++ /*
++ * We do not initialize the state of the atomic DRNG as it is identical
++ * to the DRNG at this point.
++ */
++ lrng_drng_unlock(&lrng_drng_atomic, &flags);
++
++ lrng_set_available();
++
++seed:
++ /* Seed the DRNG with any entropy available */
++ if (!lrng_pool_trylock()) {
++ lrng_drng_seed(&lrng_drng_init);
++ pr_info("ChaCha20 core initialized with first seeding\n");
++ lrng_pool_unlock();
++ } else {
++ pr_info("ChaCha20 core initialized without seeding\n");
++ }
++}
++
++bool lrng_sp80090c_compliant(void)
++{
++ if (!IS_ENABLED(CONFIG_LRNG_OVERSAMPLE_ENTROPY_SOURCES))
++ return false;
++
++ /* Entropy source hash must be capable of transporting enough entropy */
++ if (lrng_get_digestsize() <
++ (lrng_security_strength() + CONFIG_LRNG_SEED_BUFFER_INIT_ADD_BITS))
++ return false;
++
++ /* SP800-90C only requested in FIPS mode */
++ return fips_enabled;
++}
++
++/************************* Random Number Generation ***************************/
++
++/* Inject a data buffer into the DRNG */
++static void lrng_drng_inject(struct lrng_drng *drng,
++ const u8 *inbuf, u32 inbuflen)
++{
++ const char *drng_type = unlikely(drng == &lrng_drng_atomic) ?
++ "atomic" : "regular";
++ unsigned long flags = 0;
++
++ BUILD_BUG_ON(LRNG_DRNG_RESEED_THRESH > INT_MAX);
++ pr_debug("seeding %s DRNG with %u bytes\n", drng_type, inbuflen);
++ lrng_drng_lock(drng, &flags);
++ if (drng->crypto_cb->lrng_drng_seed_helper(drng->drng,
++ inbuf, inbuflen) < 0) {
++ pr_warn("seeding of %s DRNG failed\n", drng_type);
++ atomic_set(&drng->requests, 1);
++ } else {
++ pr_debug("%s DRNG stats since last seeding: %lu secs; generate calls: %d\n",
++ drng_type,
++ (time_after(jiffies, drng->last_seeded) ?
++ (jiffies - drng->last_seeded) : 0) / HZ,
++ (LRNG_DRNG_RESEED_THRESH -
++ atomic_read(&drng->requests)));
++ drng->last_seeded = jiffies;
++ atomic_set(&drng->requests, LRNG_DRNG_RESEED_THRESH);
++ drng->force_reseed = false;
++
++ if (drng->drng == lrng_drng_atomic.drng) {
++ lrng_drng_atomic.last_seeded = jiffies;
++ atomic_set(&lrng_drng_atomic.requests,
++ LRNG_DRNG_RESEED_THRESH);
++ lrng_drng_atomic.force_reseed = false;
++ }
++ }
++ lrng_drng_unlock(drng, &flags);
++}
++
++/*
++ * Perform the seeding of the DRNG with data from noise source
++ */
++static inline void _lrng_drng_seed(struct lrng_drng *drng)
++{
++ struct entropy_buf seedbuf __aligned(LRNG_KCAPI_ALIGN);
++
++ lrng_fill_seed_buffer(&seedbuf, lrng_get_seed_entropy_osr());
++ lrng_init_ops(&seedbuf);
++ lrng_drng_inject(drng, (u8 *)&seedbuf, sizeof(seedbuf));
++
++ if (!drng->fully_seeded) {
++ drng->fully_seeded = lrng_fully_seeded(&seedbuf);
++ if (drng->fully_seeded)
++ pr_debug("DRNG fully seeded\n");
++ }
++ memzero_explicit(&seedbuf, sizeof(seedbuf));
++}
++
++static int lrng_drng_get(struct lrng_drng *drng, u8 *outbuf, u32 outbuflen);
++static void lrng_drng_seed(struct lrng_drng *drng)
++{
++ _lrng_drng_seed(drng);
++
++ BUILD_BUG_ON(LRNG_MIN_SEED_ENTROPY_BITS >
++ LRNG_DRNG_SECURITY_STRENGTH_BITS);
++
++ /*
++	 * Reseed the atomic DRNG from the current DRNG.
++	 *
++	 * We can obtain random numbers from the DRNG as the lock type
++	 * chosen by lrng_drng_get() is usable with the current caller.
++ */
++ if ((drng->drng != lrng_drng_atomic.drng) &&
++ (lrng_drng_atomic.force_reseed ||
++ atomic_read(&lrng_drng_atomic.requests) <= 0 ||
++ time_after(jiffies, lrng_drng_atomic.last_seeded +
++ lrng_drng_reseed_max_time * HZ))) {
++ u8 seedbuf[LRNG_DRNG_SECURITY_STRENGTH_BYTES]
++ __aligned(LRNG_KCAPI_ALIGN);
++ int ret = lrng_drng_get(drng, seedbuf, sizeof(seedbuf));
++
++ if (ret < 0) {
++ pr_warn("Error generating random numbers for atomic DRNG: %d\n",
++ ret);
++ } else {
++ lrng_drng_inject(&lrng_drng_atomic, seedbuf, ret);
++ }
++ memzero_explicit(&seedbuf, sizeof(seedbuf));
++ }
++}
++
++static inline void _lrng_drng_seed_work(struct lrng_drng *drng, u32 node)
++{
++ pr_debug("reseed triggered by interrupt noise source for DRNG on NUMA node %d\n",
++ node);
++ lrng_drng_seed(drng);
++ if (drng->fully_seeded) {
++ /* Prevent reseed storm */
++ drng->last_seeded += node * 100 * HZ;
++ /* Prevent draining of pool on idle systems */
++ lrng_drng_reseed_max_time += 100;
++ }
++}
++
++/*
++ * DRNG reseed trigger: Kernel thread handler triggered by the schedule_work()
++ */
++void lrng_drng_seed_work(struct work_struct *dummy)
++{
++ struct lrng_drng **lrng_drng = lrng_drng_instances();
++ u32 node;
++
++ if (lrng_drng) {
++ for_each_online_node(node) {
++ struct lrng_drng *drng = lrng_drng[node];
++
++ if (drng && !drng->fully_seeded) {
++ _lrng_drng_seed_work(drng, node);
++ goto out;
++ }
++ }
++ } else {
++ if (!lrng_drng_init.fully_seeded) {
++ _lrng_drng_seed_work(&lrng_drng_init, 0);
++ goto out;
++ }
++ }
++
++ lrng_pool_all_numa_nodes_seeded(true);
++
++out:
++ /* Allow the seeding operation to be called again */
++ lrng_pool_unlock();
++}
++
++/* Force all DRNGs to reseed before next generation */
++void lrng_drng_force_reseed(void)
++{
++ struct lrng_drng **lrng_drng = lrng_drng_instances();
++ u32 node;
++
++ if (!lrng_drng) {
++ lrng_drng_init.force_reseed = lrng_drng_init.fully_seeded;
++ pr_debug("force reseed of initial DRNG\n");
++ return;
++ }
++ for_each_online_node(node) {
++ struct lrng_drng *drng = lrng_drng[node];
++
++ if (!drng)
++ continue;
++
++ drng->force_reseed = drng->fully_seeded;
++ pr_debug("force reseed of DRNG on node %u\n", node);
++ }
++ lrng_drng_atomic.force_reseed = lrng_drng_atomic.fully_seeded;
++}
++
++/**
++ * lrng_drng_get() - Get random data out of the DRNG which is reseeded
++ * frequently.
++ *
++ * @outbuf: buffer for storing random data
++ * @outbuflen: length of outbuf
++ *
++ * Return:
++ * * < 0 in error case (DRNG generation or update failed)
++ * * >= 0 the number of returned bytes
++ */
++static int lrng_drng_get(struct lrng_drng *drng, u8 *outbuf, u32 outbuflen)
++{
++ unsigned long flags = 0;
++ u32 processed = 0;
++
++ if (!outbuf || !outbuflen)
++ return 0;
++
++ outbuflen = min_t(size_t, outbuflen, INT_MAX);
++
++ lrng_drngs_init_cc20(false);
++
++ while (outbuflen) {
++ u32 todo = min_t(u32, outbuflen, LRNG_DRNG_MAX_REQSIZE);
++ int ret;
++
++ /* All but the atomic DRNG are seeded during generation */
++ if (atomic_dec_and_test(&drng->requests) ||
++ drng->force_reseed ||
++ time_after(jiffies, drng->last_seeded +
++ lrng_drng_reseed_max_time * HZ)) {
++ if (likely(drng != &lrng_drng_atomic)) {
++ if (lrng_pool_trylock()) {
++ atomic_set(&drng->requests, 1);
++ } else {
++ lrng_drng_seed(drng);
++ lrng_pool_unlock();
++ }
++ }
++ }
++
++ lrng_drng_lock(drng, &flags);
++ ret = drng->crypto_cb->lrng_drng_generate_helper(
++ drng->drng, outbuf + processed, todo);
++ lrng_drng_unlock(drng, &flags);
++ if (ret <= 0) {
++ pr_warn("getting random data from DRNG failed (%d)\n",
++ ret);
++ return -EFAULT;
++ }
++ processed += ret;
++ outbuflen -= ret;
++ }
++
++ return processed;
++}
++
++int lrng_drng_get_atomic(u8 *outbuf, u32 outbuflen)
++{
++ return lrng_drng_get(&lrng_drng_atomic, outbuf, outbuflen);
++}
++
++int lrng_drng_get_sleep(u8 *outbuf, u32 outbuflen)
++{
++ struct lrng_drng **lrng_drng = lrng_drng_instances();
++ struct lrng_drng *drng = &lrng_drng_init;
++ int node = numa_node_id();
++
++ might_sleep();
++
++ if (lrng_drng && lrng_drng[node] && lrng_drng[node]->fully_seeded)
++ drng = lrng_drng[node];
++
++ return lrng_drng_get(drng, outbuf, outbuflen);
++}
++
++/* Reset LRNG such that all existing entropy is gone */
++static void _lrng_reset(struct work_struct *work)
++{
++ struct lrng_drng **lrng_drng = lrng_drng_instances();
++ unsigned long flags = 0;
++
++ if (!lrng_drng) {
++ lrng_drng_lock(&lrng_drng_init, &flags);
++ lrng_drng_reset(&lrng_drng_init);
++ lrng_drng_unlock(&lrng_drng_init, &flags);
++ } else {
++ u32 node;
++
++ for_each_online_node(node) {
++ struct lrng_drng *drng = lrng_drng[node];
++
++ if (!drng)
++ continue;
++ lrng_drng_lock(drng, &flags);
++ lrng_drng_reset(drng);
++ lrng_drng_unlock(drng, &flags);
++ }
++ }
++ lrng_set_entropy_thresh(LRNG_INIT_ENTROPY_BITS);
++
++ lrng_reset_state();
++}
++
++static DECLARE_WORK(lrng_reset_work, _lrng_reset);
++
++void lrng_reset(void)
++{
++ schedule_work(&lrng_reset_work);
++}
++
++/***************************** Initialize LRNG *******************************/
++
++static int __init lrng_init(void)
++{
++ lrng_drngs_init_cc20(false);
++
++ lrng_drngs_numa_alloc();
++ return 0;
++}
++
++late_initcall(lrng_init);
++
++MODULE_LICENSE("Dual BSD/GPL");
++MODULE_AUTHOR("Stephan Mueller ");
++MODULE_DESCRIPTION("Linux Random Number Generator");
+--- /dev/null
++++ b/drivers/char/lrng/lrng_interfaces.c
+@@ -0,0 +1,650 @@
++// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
++/*
++ * LRNG User and kernel space interfaces
++ *
++ * Copyright (C) 2016 - 2021, Stephan Mueller
++ */
++
++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++#define CREATE_TRACE_POINTS
++#include <trace/events/random.h>
++
++#include "lrng_internal.h"
++
++/*
++ * If the entropy count falls under this number of bits, then we
++ * should wake up processes which are selecting or polling on write
++ * access to /dev/random.
++ */
++u32 lrng_write_wakeup_bits = LRNG_WRITE_WAKEUP_ENTROPY;
++
++static LIST_HEAD(lrng_ready_list);
++static DEFINE_SPINLOCK(lrng_ready_list_lock);
++
++static DECLARE_WAIT_QUEUE_HEAD(lrng_write_wait);
++static DECLARE_WAIT_QUEUE_HEAD(lrng_init_wait);
++static struct fasync_struct *fasync;
++
++struct ctl_table random_table[];
++
++/********************************** Helper ***********************************/
++
++/* Is the DRNG seed level too low? */
++static inline bool lrng_need_entropy(void)
++{
++ return (lrng_avail_aux_entropy() < lrng_write_wakeup_bits);
++}
++
++void lrng_writer_wakeup(void)
++{
++ if (lrng_need_entropy() && wq_has_sleeper(&lrng_write_wait)) {
++ wake_up_interruptible(&lrng_write_wait);
++ kill_fasync(&fasync, SIGIO, POLL_OUT);
++ }
++}
++
++void lrng_init_wakeup(void)
++{
++ wake_up_all(&lrng_init_wait);
++ kill_fasync(&fasync, SIGIO, POLL_IN);
++}
++
++/**
++ * lrng_process_ready_list() - Ping all kernel-internal callers waiting for the
++ * DRNG to be completely initialized, informing them that the DRNG reached that
++ * seed level.
++ *
++ * When the SP800-90B testing is enabled, the ping only happens if the SP800-90B
++ * startup health tests are completed. This implies that kernel internal
++ * callers always have an SP800-90B compliant noise source when being
++ * pinged.
++ */
++void lrng_process_ready_list(void)
++{
++ unsigned long flags;
++ struct random_ready_callback *rdy, *tmp;
++
++ if (!lrng_state_operational())
++ return;
++
++ spin_lock_irqsave(&lrng_ready_list_lock, flags);
++ list_for_each_entry_safe(rdy, tmp, &lrng_ready_list, list) {
++ struct module *owner = rdy->owner;
++
++ list_del_init(&rdy->list);
++ rdy->func(rdy);
++ module_put(owner);
++ }
++ spin_unlock_irqrestore(&lrng_ready_list_lock, flags);
++}
++
++void lrng_debug_report_seedlevel(const char *name)
++{
++#ifdef CONFIG_WARN_ALL_UNSEEDED_RANDOM
++ static void *previous = NULL;
++ void *caller = (void *) _RET_IP_;
++
++ if (READ_ONCE(previous) == caller)
++ return;
++
++ if (!lrng_state_min_seeded())
++ pr_notice("%pS %s called without reaching minimally seeded level (available entropy %u)\n",
++ caller, name, lrng_avail_entropy());
++
++ WRITE_ONCE(previous, caller);
++#endif
++}
++
++/************************ LRNG kernel input interfaces ************************/
++
++/**
++ * add_hwgenerator_randomness() - Interface for in-kernel drivers of true
++ * hardware RNGs.
++ *
++ * Those devices may produce endless random bits and will be throttled
++ * when our pool is full.
++ *
++ * @buffer: buffer holding the entropic data from HW noise sources to be
++ *	     inserted into the entropy pool.
++ * @count: length of buffer
++ * @entropy_bits: amount of entropy in buffer (value is in bits)
++ */
++void add_hwgenerator_randomness(const char *buffer, size_t count,
++ size_t entropy_bits)
++{
++ /*
++ * Suspend writing if we are fully loaded with entropy.
++ * We'll be woken up again once below lrng_write_wakeup_thresh,
++ * or when the calling thread is about to terminate.
++ */
++ wait_event_interruptible(lrng_write_wait,
++ lrng_need_entropy() ||
++ lrng_state_exseed_allow(lrng_noise_source_hw) ||
++ kthread_should_stop());
++ lrng_state_exseed_set(lrng_noise_source_hw, false);
++ lrng_pool_insert_aux(buffer, count, entropy_bits);
++}
++EXPORT_SYMBOL_GPL(add_hwgenerator_randomness);
++
++/**
++ * add_bootloader_randomness() - Handle random seed passed by bootloader.
++ *
++ * If the seed is trustworthy, it is treated like input from a hardware RNG.
++ * Otherwise it is treated as device data.
++ * The decision is controlled by CONFIG_RANDOM_TRUST_BOOTLOADER.
++ *
++ * @buf: buffer holding the entropic data passed by the bootloader to be
++ *	  inserted into the entropy pool.
++ * @size: length of buffer
++ */
++void add_bootloader_randomness(const void *buf, unsigned int size)
++{
++ lrng_pool_insert_aux(buf, size,
++ IS_ENABLED(CONFIG_RANDOM_TRUST_BOOTLOADER) ?
++ size * 8 : 0);
++}
++EXPORT_SYMBOL_GPL(add_bootloader_randomness);
++
++/*
++ * Callback for HID layer -- use the HID event values to stir the entropy pool
++ */
++void add_input_randomness(unsigned int type, unsigned int code,
++ unsigned int value)
++{
++ static unsigned char last_value;
++
++ /* ignore autorepeat and the like */
++ if (value == last_value)
++ return;
++
++ last_value = value;
++
++ lrng_pcpu_array_add_u32((type << 4) ^ code ^ (code >> 4) ^ value);
++}
++EXPORT_SYMBOL_GPL(add_input_randomness);
++
++/**
++ * add_device_randomness() - Add device- or boot-specific data to the entropy
++ * pool to help initialize it.
++ *
++ * None of this adds any entropy; it is meant to avoid the problem of
++ * the entropy pool having similar initial state across largely
++ * identical devices.
++ *
++ * @buf: buffer holding the device- or boot-specific data to be inserted into
++ *	  the entropy pool.
++ * @size: length of buffer
++ */
++void add_device_randomness(const void *buf, unsigned int size)
++{
++ lrng_pool_insert_aux((u8 *)buf, size, 0);
++}
++EXPORT_SYMBOL(add_device_randomness);
++
++#ifdef CONFIG_BLOCK
++void rand_initialize_disk(struct gendisk *disk) { }
++void add_disk_randomness(struct gendisk *disk) { }
++EXPORT_SYMBOL(add_disk_randomness);
++#endif
++
++/**
++ * del_random_ready_callback() - Delete a previously registered readiness
++ * callback function.
++ *
++ * @rdy: callback definition that was registered initially
++ */
++void del_random_ready_callback(struct random_ready_callback *rdy)
++{
++ unsigned long flags;
++ struct module *owner = NULL;
++
++ spin_lock_irqsave(&lrng_ready_list_lock, flags);
++ if (!list_empty(&rdy->list)) {
++ list_del_init(&rdy->list);
++ owner = rdy->owner;
++ }
++ spin_unlock_irqrestore(&lrng_ready_list_lock, flags);
++
++ module_put(owner);
++}
++EXPORT_SYMBOL(del_random_ready_callback);
++
++/**
++ * add_random_ready_callback() - Add a callback function that will be invoked
++ * when the DRNG is fully initialized and seeded.
++ *
++ * @rdy: callback definition to be invoked when the LRNG is seeded
++ *
++ * Return:
++ * * 0 if callback is successfully added
++ * * -EALREADY if pool is already initialised (callback not called)
++ * * -ENOENT if module for callback is not alive
++ */
++int add_random_ready_callback(struct random_ready_callback *rdy)
++{
++ struct module *owner;
++ unsigned long flags;
++ int err = -EALREADY;
++
++ if (likely(lrng_state_operational()))
++ return err;
++
++ owner = rdy->owner;
++ if (!try_module_get(owner))
++ return -ENOENT;
++
++ spin_lock_irqsave(&lrng_ready_list_lock, flags);
++ if (lrng_state_operational())
++ goto out;
++
++ owner = NULL;
++
++ list_add(&rdy->list, &lrng_ready_list);
++ err = 0;
++
++out:
++ spin_unlock_irqrestore(&lrng_ready_list_lock, flags);
++
++ module_put(owner);
++
++ return err;
++}
++EXPORT_SYMBOL(add_random_ready_callback);
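
For illustration, a minimal, hypothetical in-kernel user of the readiness
callback documented above might look as follows (the module and function
names are invented for the sketch; only the struct random_ready_callback API
is taken from the code above):

#include <linux/module.h>
#include <linux/random.h>

static void example_rng_ready(struct random_ready_callback *rdy)
{
	pr_info("example: LRNG is fully operational\n");
}

static struct random_ready_callback example_rdy = {
	.list	= LIST_HEAD_INIT(example_rdy.list),
	.func	= example_rng_ready,
	.owner	= THIS_MODULE,
};

static int __init example_init(void)
{
	int ret = add_random_ready_callback(&example_rdy);

	if (ret == -EALREADY) {
		/* DRNG is already operational: act immediately */
		example_rng_ready(&example_rdy);
		return 0;
	}
	return ret;
}

static void __exit example_exit(void)
{
	/* No-op if the callback already fired or was never registered */
	del_random_ready_callback(&example_rdy);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
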
++
++/*********************** LRNG kernel output interfaces ************************/
++
++/**
++ * get_random_bytes() - Provider of cryptographic strong random numbers for
++ * kernel-internal usage.
++ *
++ * This function is appropriate for all in-kernel use cases. However,
++ * it will always use the ChaCha20 DRNG.
++ *
++ * @buf: buffer to store the random bytes
++ * @nbytes: size of the buffer
++ */
++void get_random_bytes(void *buf, int nbytes)
++{
++ lrng_drng_get_atomic((u8 *)buf, (u32)nbytes);
++ lrng_debug_report_seedlevel("get_random_bytes");
++}
++EXPORT_SYMBOL(get_random_bytes);
++
++/**
++ * get_random_bytes_full() - Provider of cryptographic strong random numbers
++ * for kernel-internal usage.
++ *
++ * This function is appropriate only for non-atomic use cases as this
++ * function may sleep. Though, it provides access to the full functionality
++ * of LRNG including the switchable DRNG support, that may support other
++ * DRNGs such as the SP800-90A DRBG.
++ *
++ * @buf: buffer to store the random bytes
++ * @nbytes: size of the buffer
++ */
++void get_random_bytes_full(void *buf, int nbytes)
++{
++ lrng_drng_get_sleep((u8 *)buf, (u32)nbytes);
++ lrng_debug_report_seedlevel("get_random_bytes_full");
++}
++EXPORT_SYMBOL(get_random_bytes_full);
++
++/**
++ * wait_for_random_bytes() - Wait for the LRNG to be seeded and thus
++ * guaranteed to supply cryptographically secure random numbers.
++ *
++ * This applies to: the /dev/urandom device, the get_random_bytes function,
++ * and the get_random_{u32,u64,int,long} family of functions. Using any of
++ * these functions without first calling this function forfeits the guarantee
++ * of security.
++ *
++ * Return:
++ * * 0 if the LRNG has been seeded.
++ * * -ERESTARTSYS if the function was interrupted by a signal.
++ */
++int wait_for_random_bytes(void)
++{
++ if (likely(lrng_state_min_seeded()))
++ return 0;
++ return wait_event_interruptible(lrng_init_wait,
++ lrng_state_min_seeded());
++}
++EXPORT_SYMBOL(wait_for_random_bytes);
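
A minimal sketch (not part of the patch; the function name and buffer handling
are invented for illustration) of the usage pattern required by the guarantee
described above:

#include <linux/random.h>
#include <linux/types.h>

static int example_generate_key(u8 *key, unsigned int len)
{
	/* Block until the LRNG is minimally seeded; may return -ERESTARTSYS */
	int ret = wait_for_random_bytes();

	if (ret)
		return ret;

	get_random_bytes(key, len);
	return 0;
}
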
++
++/**
++ * get_random_bytes_arch() - This function will use the architecture-specific
++ * hardware random number generator if it is available.
++ *
++ * The arch-specific hw RNG will almost certainly be faster than what we can
++ * do in software, but it is impossible to verify that it is implemented
++ * securely (as opposed to, say, the AES encryption of a sequence number using
++ * a key known by the NSA). So it's useful if we need the speed, but only if
++ * we're willing to trust the hardware manufacturer not to have put in a back
++ * door.
++ *
++ * @buf: buffer allocated by caller to store the random data in
++ * @nbytes: length of outbuf
++ *
++ * Return: number of bytes filled in.
++ */
++int __must_check get_random_bytes_arch(void *buf, int nbytes)
++{
++ u8 *p = buf;
++
++ while (nbytes) {
++ unsigned long v;
++ int chunk = min_t(int, nbytes, sizeof(unsigned long));
++
++ if (!arch_get_random_long(&v))
++ break;
++
++ memcpy(p, &v, chunk);
++ p += chunk;
++ nbytes -= chunk;
++ }
++
++ if (nbytes)
++ lrng_drng_get_atomic((u8 *)p, (u32)nbytes);
++
++ return nbytes;
++}
++EXPORT_SYMBOL(get_random_bytes_arch);
++
++/*
++ * Returns whether or not the LRNG has been seeded.
++ *
++ * Returns: true if the urandom pool has been seeded.
++ * false if the urandom pool has not been seeded.
++ */
++bool rng_is_initialized(void)
++{
++ return lrng_state_operational();
++}
++EXPORT_SYMBOL(rng_is_initialized);
++
++/************************ LRNG user output interfaces *************************/
++
++static ssize_t lrng_read_common(char __user *buf, size_t nbytes)
++{
++ ssize_t ret = 0;
++ u8 tmpbuf[LRNG_DRNG_BLOCKSIZE] __aligned(LRNG_KCAPI_ALIGN);
++ u8 *tmp_large = NULL, *tmp = tmpbuf;
++ u32 tmplen = sizeof(tmpbuf);
++
++ if (nbytes == 0)
++ return 0;
++
++ /*
++	 * Satisfy large read requests -- as the common case is smaller
++	 * request sizes, such as 16 or 32 bytes, avoid the kmalloc overhead
++	 * for those by using the stack variable tmpbuf.
++ */
++ if (!CONFIG_BASE_SMALL && (nbytes > sizeof(tmpbuf))) {
++ tmplen = min_t(u32, nbytes, LRNG_DRNG_MAX_REQSIZE);
++ tmp_large = kmalloc(tmplen + LRNG_KCAPI_ALIGN, GFP_KERNEL);
++ if (!tmp_large)
++ tmplen = sizeof(tmpbuf);
++ else
++ tmp = PTR_ALIGN(tmp_large, LRNG_KCAPI_ALIGN);
++ }
++
++ while (nbytes) {
++ u32 todo = min_t(u32, nbytes, tmplen);
++ int rc = 0;
++
++ /* Reschedule if we received a large request. */
++ if ((tmp_large) && need_resched()) {
++ if (signal_pending(current)) {
++ if (ret == 0)
++ ret = -ERESTARTSYS;
++ break;
++ }
++ schedule();
++ }
++
++ rc = lrng_drng_get_sleep(tmp, todo);
++ if (rc <= 0) {
++ if (rc < 0)
++ ret = rc;
++ break;
++ }
++ if (copy_to_user(buf, tmp, rc)) {
++ ret = -EFAULT;
++ break;
++ }
++
++ nbytes -= rc;
++ buf += rc;
++ ret += rc;
++ }
++
++ /* Wipe data just returned from memory */
++ if (tmp_large)
++ kfree_sensitive(tmp_large);
++ else
++ memzero_explicit(tmpbuf, sizeof(tmpbuf));
++
++ return ret;
++}
++
++static ssize_t
++lrng_read_common_block(int nonblock, char __user *buf, size_t nbytes)
++{
++ if (nbytes == 0)
++ return 0;
++
++ if (unlikely(!lrng_state_operational())) {
++ int ret;
++
++ if (nonblock)
++ return -EAGAIN;
++
++ ret = wait_event_interruptible(lrng_init_wait,
++ lrng_state_operational());
++ if (unlikely(ret))
++ return ret;
++ }
++
++ return lrng_read_common(buf, nbytes);
++}
++
++static ssize_t lrng_drng_read_block(struct file *file, char __user *buf,
++ size_t nbytes, loff_t *ppos)
++{
++ return lrng_read_common_block(file->f_flags & O_NONBLOCK, buf, nbytes);
++}
++
++static __poll_t lrng_random_poll(struct file *file, poll_table *wait)
++{
++ __poll_t mask;
++
++ poll_wait(file, &lrng_init_wait, wait);
++ poll_wait(file, &lrng_write_wait, wait);
++ mask = 0;
++ if (lrng_state_operational())
++ mask |= EPOLLIN | EPOLLRDNORM;
++ if (lrng_need_entropy() ||
++ lrng_state_exseed_allow(lrng_noise_source_user))
++ mask |= EPOLLOUT | EPOLLWRNORM;
++ return mask;
++}
++
++static ssize_t lrng_drng_write_common(const char __user *buffer, size_t count,
++ u32 entropy_bits)
++{
++ ssize_t ret = 0;
++ u8 buf[64] __aligned(LRNG_KCAPI_ALIGN);
++ const char __user *p = buffer;
++ u32 orig_entropy_bits = entropy_bits;
++
++ if (!lrng_get_available())
++ return -EAGAIN;
++
++ count = min_t(size_t, count, INT_MAX);
++ while (count > 0) {
++ size_t bytes = min_t(size_t, count, sizeof(buf));
++ u32 ent = min_t(u32, bytes<<3, entropy_bits);
++
++ if (copy_from_user(&buf, p, bytes))
++ return -EFAULT;
++ /* Inject data into entropy pool */
++ lrng_pool_insert_aux(buf, bytes, ent);
++
++ count -= bytes;
++ p += bytes;
++ ret += bytes;
++ entropy_bits -= ent;
++
++ cond_resched();
++ }
++
++ /* Force reseed of DRNG during next data request. */
++ if (!orig_entropy_bits)
++ lrng_drng_force_reseed();
++
++ return ret;
++}
++
++static ssize_t lrng_drng_read(struct file *file, char __user *buf,
++ size_t nbytes, loff_t *ppos)
++{
++ if (!lrng_state_min_seeded())
++ pr_notice_ratelimited("%s - use of insufficiently seeded DRNG (%zu bytes read)\n",
++ current->comm, nbytes);
++ else if (!lrng_state_operational())
++ pr_debug_ratelimited("%s - use of not fully seeded DRNG (%zu bytes read)\n",
++ current->comm, nbytes);
++
++ return lrng_read_common(buf, nbytes);
++}
++
++static ssize_t lrng_drng_write(struct file *file, const char __user *buffer,
++ size_t count, loff_t *ppos)
++{
++ return lrng_drng_write_common(buffer, count, 0);
++}
++
++static long lrng_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
++{
++ u32 digestsize_bits;
++ int size, ent_count_bits;
++ int __user *p = (int __user *)arg;
++
++ switch (cmd) {
++ case RNDGETENTCNT:
++ ent_count_bits = lrng_avail_entropy();
++ if (put_user(ent_count_bits, p))
++ return -EFAULT;
++ return 0;
++ case RNDADDTOENTCNT:
++ if (!capable(CAP_SYS_ADMIN))
++ return -EPERM;
++ if (get_user(ent_count_bits, p))
++ return -EFAULT;
++ ent_count_bits = (int)lrng_avail_aux_entropy() + ent_count_bits;
++ if (ent_count_bits < 0)
++ ent_count_bits = 0;
++ digestsize_bits = lrng_get_digestsize();
++ if (ent_count_bits > digestsize_bits)
++ ent_count_bits = digestsize_bits;
++ lrng_pool_set_entropy(ent_count_bits);
++ return 0;
++ case RNDADDENTROPY:
++ if (!capable(CAP_SYS_ADMIN))
++ return -EPERM;
++ if (get_user(ent_count_bits, p++))
++ return -EFAULT;
++ if (ent_count_bits < 0)
++ return -EINVAL;
++ if (get_user(size, p++))
++ return -EFAULT;
++ if (size < 0)
++ return -EINVAL;
++ lrng_state_exseed_set(lrng_noise_source_user, false);
++ /* there cannot be more entropy than data */
++ ent_count_bits = min(ent_count_bits, size<<3);
++ return lrng_drng_write_common((const char __user *)p, size,
++ ent_count_bits);
++ case RNDZAPENTCNT:
++ case RNDCLEARPOOL:
++ /* Clear the entropy pool counter. */
++ if (!capable(CAP_SYS_ADMIN))
++ return -EPERM;
++ lrng_pool_set_entropy(0);
++ return 0;
++ case RNDRESEEDCRNG:
++ /*
++		 * We leave the capability check here since it is present
++		 * in the upstream RNG implementation. Yet, user space
++		 * can trigger a reseed as easily as by writing into
++		 * /dev/random or /dev/urandom, where no privilege is needed.
++ */
++ if (!capable(CAP_SYS_ADMIN))
++ return -EPERM;
++ /* Force a reseed of all DRNGs */
++ lrng_drng_force_reseed();
++ return 0;
++ default:
++ return -EINVAL;
++ }
++}
++
++static int lrng_fasync(int fd, struct file *filp, int on)
++{
++ return fasync_helper(fd, filp, on, &fasync);
++}
++
++const struct file_operations random_fops = {
++ .read = lrng_drng_read_block,
++ .write = lrng_drng_write,
++ .poll = lrng_random_poll,
++ .unlocked_ioctl = lrng_ioctl,
++ .compat_ioctl = compat_ptr_ioctl,
++ .fasync = lrng_fasync,
++ .llseek = noop_llseek,
++};
++
++const struct file_operations urandom_fops = {
++ .read = lrng_drng_read,
++ .write = lrng_drng_write,
++ .unlocked_ioctl = lrng_ioctl,
++ .compat_ioctl = compat_ptr_ioctl,
++ .fasync = lrng_fasync,
++ .llseek = noop_llseek,
++};
++
++SYSCALL_DEFINE3(getrandom, char __user *, buf, size_t, count,
++ unsigned int, flags)
++{
++ if (flags & ~(GRND_NONBLOCK|GRND_RANDOM|GRND_INSECURE))
++ return -EINVAL;
++
++ /*
++ * Requesting insecure and blocking randomness at the same time makes
++ * no sense.
++ */
++ if ((flags &
++ (GRND_INSECURE|GRND_RANDOM)) == (GRND_INSECURE|GRND_RANDOM))
++ return -EINVAL;
++
++ if (count > INT_MAX)
++ count = INT_MAX;
++
++ if (flags & GRND_INSECURE)
++ return lrng_drng_read(NULL, buf, count, NULL);
++
++ return lrng_read_common_block(flags & GRND_NONBLOCK, buf, count);
++	return lrng_read_common_block(flags & GRND_NONBLOCK, buf, count);
++}
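
From user space, the flag handling implemented in this syscall can be
exercised with a short standalone sketch (not part of the patch; requires
glibc 2.25+ for <sys/random.h>):

#include <errno.h>
#include <stdio.h>
#include <sys/random.h>
#include <sys/types.h>

int main(void)
{
	unsigned char buf[32];
	ssize_t ret;

	/* Non-blocking request: fails with EAGAIN until the LRNG is operational */
	ret = getrandom(buf, sizeof(buf), GRND_NONBLOCK);
	if (ret < 0 && errno == EAGAIN)
		fprintf(stderr, "LRNG not yet operational\n");

	/* Default behavior: block until fully seeded, then fill the buffer */
	ret = getrandom(buf, sizeof(buf), 0);

	return ret == (ssize_t)sizeof(buf) ? 0 : 1;
}

Passing GRND_INSECURE together with GRND_RANDOM is rejected with -EINVAL, as
the check at the top of the handler shows.
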
+--- /dev/null
++++ b/drivers/char/lrng/lrng_internal.h
+@@ -0,0 +1,416 @@
++/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
++/*
++ * Copyright (C) 2018 - 2021, Stephan Mueller
++ */
++
++#ifndef _LRNG_INTERNAL_H
++#define _LRNG_INTERNAL_H
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++/*************************** General LRNG parameter ***************************/
++
++/* Security strength of LRNG -- this must match DRNG security strength */
++#define LRNG_DRNG_SECURITY_STRENGTH_BYTES 32
++#define LRNG_DRNG_SECURITY_STRENGTH_BITS (LRNG_DRNG_SECURITY_STRENGTH_BYTES * 8)
++#define LRNG_DRNG_BLOCKSIZE 64 /* Maximum of DRNG block sizes */
++
++/*
++ * SP800-90A defines a maximum request size of 1<<16 bytes. The given value is
++ * considered a safer margin.
++ *
++ * This value is allowed to be changed.
++ */
++#define LRNG_DRNG_MAX_REQSIZE (1<<12)
++
++/*
++ * SP800-90A defines a maximum number of requests between reseeds of 2^48.
++ * The given value is considered a much safer margin, balancing requests for
++ * frequent reseeds with the need to conserve entropy. This value MUST NOT be
++ * larger than INT_MAX because it is used in an atomic_t.
++ *
++ * This value is allowed to be changed.
++ */
++#define LRNG_DRNG_RESEED_THRESH (1<<20)
++
++/*
++ * Number of interrupts to be recorded to assume that DRNG security strength
++ * bits of entropy are received.
++ * Note: a value below the DRNG security strength should not be defined as this
++ * may imply the DRNG can never be fully seeded in case other noise
++ * sources are unavailable.
++ *
++ * This value is allowed to be changed.
++ */
++#define LRNG_IRQ_ENTROPY_BITS CONFIG_LRNG_IRQ_ENTROPY_RATE
++
++/*
++ * Min required seed entropy is 128 bits covering the minimum entropy
++ * requirement of SP800-131A and the German BSI's TR02102.
++ *
++ * This value is allowed to be changed.
++ */
++#define LRNG_FULL_SEED_ENTROPY_BITS LRNG_DRNG_SECURITY_STRENGTH_BITS
++#define LRNG_MIN_SEED_ENTROPY_BITS 128
++#define LRNG_INIT_ENTROPY_BITS 32
++
++/*
++ * Wakeup value
++ *
++ * This value is allowed to be changed but must not be larger than the
++ * digest size of the hash operation used to update the aux_pool.
++ */
++#ifdef CONFIG_CRYPTO_LIB_SHA256
++# define LRNG_ATOMIC_DIGEST_SIZE SHA256_DIGEST_SIZE
++#else
++# define LRNG_ATOMIC_DIGEST_SIZE SHA1_DIGEST_SIZE
++#endif
++#define LRNG_WRITE_WAKEUP_ENTROPY LRNG_ATOMIC_DIGEST_SIZE
++
++/*
++ * If the switching support is configured, we must provide support up to
++ * the largest digest size. Without switching support, we know it is only
++ * the built-in digest size.
++ */
++#ifdef CONFIG_LRNG_DRNG_SWITCH
++# define LRNG_MAX_DIGESTSIZE 64
++#else
++# define LRNG_MAX_DIGESTSIZE LRNG_ATOMIC_DIGEST_SIZE
++#endif
++
++/*
++ * Oversampling factor of IRQ events to obtain
++ * LRNG_DRNG_SECURITY_STRENGTH_BYTES. This factor is used when a
++ * high-resolution time stamp is not available. In this case, jiffies and
++ * register contents are used to fill the entropy pool. These noise sources
++ * are much less entropic than the high-resolution timer. The entropy content
++ * is the entropy content assumed with LRNG_IRQ_ENTROPY_BITS divided by
++ * LRNG_IRQ_OVERSAMPLING_FACTOR.
++ *
++ * This value is allowed to be changed.
++ */
++#define LRNG_IRQ_OVERSAMPLING_FACTOR 10
++
++/* Alignmask that is intended to be identical to CRYPTO_MINALIGN */
++#define LRNG_KCAPI_ALIGN ARCH_KMALLOC_MINALIGN
++
++/*
++ * This definition must provide a buffer that is equal to SHASH_DESC_ON_STACK
++ * as it will be cast into a struct shash_desc.
++ */
++#define LRNG_POOL_SIZE (sizeof(struct shash_desc) + HASH_MAX_DESCSIZE)
++
++/************************ Default DRNG implementation *************************/
++
++extern struct chacha20_state chacha20;
++extern const struct lrng_crypto_cb lrng_cc20_crypto_cb;
++void lrng_cc20_init_state(struct chacha20_state *state);
++
++/********************************** /proc *************************************/
++
++static inline void lrng_pool_inc_numa_node(void) { }
++
++/****************************** LRNG interfaces *******************************/
++
++extern u32 lrng_write_wakeup_bits;
++extern int lrng_drng_reseed_max_time;
++
++void lrng_writer_wakeup(void);
++void lrng_init_wakeup(void);
++void lrng_debug_report_seedlevel(const char *name);
++void lrng_process_ready_list(void);
++
++/* External interface to use of the switchable DRBG inside the kernel */
++void get_random_bytes_full(void *buf, int nbytes);
++
++/************************* Jitter RNG Entropy Source **************************/
++
++#ifdef CONFIG_LRNG_JENT
++u32 lrng_get_jent(u8 *outbuf, u32 requested_bits);
++u32 lrng_jent_entropylevel(u32 requested_bits);
++#else /* CONFIG_LRNG_JENT */
++static inline u32 lrng_get_jent(u8 *outbuf, u32 requested_bits) { return 0; }
++static inline u32 lrng_jent_entropylevel(u32 requested_bits) { return 0; }
++#endif /* CONFIG_LRNG_JENT */
++
++/************************** CPU-based Entropy Source **************************/
++
++static inline u32 lrng_fast_noise_entropylevel(u32 ent_bits, u32 requested_bits)
++{
++ /* Obtain entropy statement */
++ ent_bits = ent_bits * requested_bits / LRNG_DRNG_SECURITY_STRENGTH_BITS;
++ /* Cap entropy to buffer size in bits */
++ ent_bits = min_t(u32, ent_bits, requested_bits);
++ return ent_bits;
++}
++
++u32 lrng_get_arch(u8 *outbuf, u32 requested_bits);
++u32 lrng_archrandom_entropylevel(u32 requested_bits);
++
++/************************** Interrupt Entropy Source **************************/
++
++bool lrng_pcpu_continuous_compression_state(void);
++void lrng_pcpu_reset(void);
++u32 lrng_pcpu_avail_pool_size(void);
++u32 lrng_pcpu_avail_entropy(void);
++int lrng_pcpu_switch_hash(int node,
++ const struct lrng_crypto_cb *new_cb, void *new_hash,
++ const struct lrng_crypto_cb *old_cb);
++u32 lrng_pcpu_pool_hash(u8 *outbuf, u32 requested_bits, bool fully_seeded);
++void lrng_pcpu_array_add_u32(u32 data);
++
++/****************************** DRNG processing *******************************/
++
++/* DRNG state handle */
++struct lrng_drng {
++ void *drng; /* DRNG handle */
++ void *hash; /* Hash handle */
++ const struct lrng_crypto_cb *crypto_cb; /* Crypto callbacks */
++ atomic_t requests; /* Number of DRNG requests */
++ unsigned long last_seeded; /* Last time it was seeded */
++ bool fully_seeded; /* Is DRNG fully seeded? */
++ bool force_reseed; /* Force a reseed */
++
++ /* Lock write operations on DRNG state, DRNG replacement of crypto_cb */
++ struct mutex lock;
++ spinlock_t spin_lock;
++ /* Lock *hash replacement - always take before DRNG lock */
++ rwlock_t hash_lock;
++};
++
++extern struct mutex lrng_crypto_cb_update;
++
++struct lrng_drng *lrng_drng_init_instance(void);
++struct lrng_drng *lrng_drng_atomic_instance(void);
++
++static __always_inline bool lrng_drng_is_atomic(struct lrng_drng *drng)
++{
++ return (drng->drng == lrng_drng_atomic_instance()->drng);
++}
++
++/* Lock the DRNG */
++static __always_inline void lrng_drng_lock(struct lrng_drng *drng,
++ unsigned long *flags)
++ __acquires(&drng->spin_lock)
++{
++ /* Use spin lock in case the atomic DRNG context is used */
++ if (lrng_drng_is_atomic(drng)) {
++ spin_lock_irqsave(&drng->spin_lock, *flags);
++
++ /*
++ * In case a lock transition happened while we were spinning,
++ * catch this case and use the new lock type.
++ */
++ if (!lrng_drng_is_atomic(drng)) {
++ spin_unlock_irqrestore(&drng->spin_lock, *flags);
++ __acquire(&drng->spin_lock);
++ mutex_lock(&drng->lock);
++ }
++ } else {
++ __acquire(&drng->spin_lock);
++ mutex_lock(&drng->lock);
++ }
++}
++
++/* Unlock the DRNG */
++static __always_inline void lrng_drng_unlock(struct lrng_drng *drng,
++ unsigned long *flags)
++ __releases(&drng->spin_lock)
++{
++ if (lrng_drng_is_atomic(drng)) {
++ spin_unlock_irqrestore(&drng->spin_lock, *flags);
++ } else {
++ mutex_unlock(&drng->lock);
++ __release(&drng->spin_lock);
++ }
++}
++
++void lrng_reset(void);
++void lrng_drngs_init_cc20(bool force_seed);
++bool lrng_sp80090c_compliant(void);
++
++static inline u32 lrng_compress_osr(void)
++{
++ return lrng_sp80090c_compliant() ? CONFIG_LRNG_OVERSAMPLE_ES_BITS : 0;
++}
++
++static inline u32 lrng_reduce_by_osr(u32 entropy_bits)
++{
++ u32 osr_bits = lrng_compress_osr();
++ return (entropy_bits >= osr_bits) ? (entropy_bits - osr_bits) : 0;
++}
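
As a worked example (the config value is an assumption, not taken from this
patch): with CONFIG_LRNG_OVERSAMPLE_ES_BITS set to 64 and SP800-90C compliance
active, an entropy source reporting 256 bits is credited
lrng_reduce_by_osr(256) == 192 bits, while any amount at or below the 64-bit
oversampling rate is credited 0 bits. Without SP800-90C compliance,
lrng_compress_osr() returns 0 and the estimate passes through unchanged.
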
++
++bool lrng_get_available(void);
++void lrng_set_available(void);
++void lrng_drng_reset(struct lrng_drng *drng);
++int lrng_drng_get_atomic(u8 *outbuf, u32 outbuflen);
++int lrng_drng_get_sleep(u8 *outbuf, u32 outbuflen);
++void lrng_drng_force_reseed(void);
++void lrng_drng_seed_work(struct work_struct *dummy);
++
++static inline struct lrng_drng **lrng_drng_instances(void) { return NULL; }
++static inline void lrng_drngs_numa_alloc(void) { return; }
++
++/************************* Entropy sources management *************************/
++
++enum lrng_external_noise_source {
++ lrng_noise_source_hw,
++ lrng_noise_source_user
++};
++
++u32 lrng_avail_aux_entropy(void);
++u32 lrng_get_digestsize(void);
++
++/* Obtain the security strength of the LRNG in bits */
++static inline u32 lrng_security_strength(void)
++{
++ /*
++ * We use a hash to read the entropy in the entropy pool. According to
++ * SP800-90B table 1, the entropy can be at most the digest size.
++ * Considering this together with the last sentence in section 3.1.5.1.2
++ * the security strength of a (approved) hash is equal to its output
++ * size. On the other hand the entropy cannot be larger than the
++ * security strength of the used DRBG.
++ */
++ return min_t(u32, LRNG_FULL_SEED_ENTROPY_BITS, lrng_get_digestsize());
++}
++
++static inline u32 lrng_get_seed_entropy_osr(void)
++{
++ u32 requested_bits = lrng_security_strength();
++
++ /* Apply oversampling during initialization according to SP800-90C */
++ if (lrng_sp80090c_compliant())
++ requested_bits += CONFIG_LRNG_SEED_BUFFER_INIT_ADD_BITS;
++ return requested_bits;
++}
++
++void lrng_set_entropy_thresh(u32 new);
++u32 lrng_avail_entropy(void);
++void lrng_reset_state(void);
++
++bool lrng_state_exseed_allow(enum lrng_external_noise_source source);
++void lrng_state_exseed_set(enum lrng_external_noise_source source, bool type);
++bool lrng_state_min_seeded(void);
++bool lrng_state_fully_seeded(void);
++bool lrng_state_operational(void);
++
++int lrng_pool_trylock(void);
++void lrng_pool_unlock(void);
++void lrng_pool_all_numa_nodes_seeded(bool set);
++bool lrng_pool_highres_timer(void);
++void lrng_pool_set_entropy(u32 entropy_bits);
++int lrng_aux_switch_hash(const struct lrng_crypto_cb *new_cb, void *new_hash,
++ const struct lrng_crypto_cb *old_cb);
++int lrng_pool_insert_aux(const u8 *inbuf, u32 inbuflen, u32 entropy_bits);
++void lrng_pool_add_entropy(void);
++
++struct entropy_buf {
++ u8 a[LRNG_DRNG_SECURITY_STRENGTH_BYTES +
++ (CONFIG_LRNG_SEED_BUFFER_INIT_ADD_BITS >> 3)];
++ u8 b[LRNG_DRNG_SECURITY_STRENGTH_BYTES +
++ (CONFIG_LRNG_SEED_BUFFER_INIT_ADD_BITS >> 3)];
++ u8 c[LRNG_DRNG_SECURITY_STRENGTH_BYTES +
++ (CONFIG_LRNG_SEED_BUFFER_INIT_ADD_BITS >> 3)];
++ u8 d[LRNG_DRNG_SECURITY_STRENGTH_BYTES +
++ (CONFIG_LRNG_SEED_BUFFER_INIT_ADD_BITS >> 3)];
++ u32 now, a_bits, b_bits, c_bits, d_bits;
++};
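
To make the sizing concrete (the Kconfig value is an assumption for the
example): with LRNG_DRNG_SECURITY_STRENGTH_BYTES = 32 and
CONFIG_LRNG_SEED_BUFFER_INIT_ADD_BITS = 128, each of the four slices a..d is
32 + 16 = 48 bytes, so the seed buffer carries 192 bytes of entropy data plus
the five u32 fields holding the time stamp and the per-slice entropy
estimates.
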
++
++bool lrng_fully_seeded(struct entropy_buf *eb);
++void lrng_unset_operational(void);
++void lrng_fill_seed_buffer(struct entropy_buf *entropy_buf, u32 requested_bits);
++void lrng_init_ops(struct entropy_buf *eb);
++
++/************************** Health Test linking code **************************/
++
++enum lrng_health_res {
++ lrng_health_pass, /* Health test passes on time stamp */
++ lrng_health_fail_use, /* Time stamp unhealthy, but mix in */
++ lrng_health_fail_drop /* Time stamp unhealthy, drop it */
++};
++
++#ifdef CONFIG_LRNG_HEALTH_TESTS
++bool lrng_sp80090b_startup_complete(void);
++bool lrng_sp80090b_compliant(void);
++
++enum lrng_health_res lrng_health_test(u32 now_time);
++void lrng_health_disable(void);
++
++#else /* CONFIG_LRNG_HEALTH_TESTS */
++static inline bool lrng_sp80090b_startup_complete(void) { return true; }
++static inline bool lrng_sp80090b_compliant(void) { return false; }
++
++static inline enum lrng_health_res
++lrng_health_test(u32 now_time) { return lrng_health_pass; }
++static inline void lrng_health_disable(void) { }
++#endif /* CONFIG_LRNG_HEALTH_TESTS */
++
++/****************************** Helper code ***********************************/
++
++static inline u32 atomic_read_u32(atomic_t *v)
++{
++ return (u32)atomic_read(v);
++}
++
++/*************************** Auxiliary functions ******************************/
++
++void invalidate_batched_entropy(void);
++
++/***************************** Testing code ***********************************/
++
++#ifdef CONFIG_LRNG_RAW_HIRES_ENTROPY
++bool lrng_raw_hires_entropy_store(u32 value);
++#else /* CONFIG_LRNG_RAW_HIRES_ENTROPY */
++static inline bool lrng_raw_hires_entropy_store(u32 value) { return false; }
++#endif /* CONFIG_LRNG_RAW_HIRES_ENTROPY */
++
++#ifdef CONFIG_LRNG_RAW_JIFFIES_ENTROPY
++bool lrng_raw_jiffies_entropy_store(u32 value);
++#else /* CONFIG_LRNG_RAW_JIFFIES_ENTROPY */
++static inline bool lrng_raw_jiffies_entropy_store(u32 value) { return false; }
++#endif /* CONFIG_LRNG_RAW_JIFFIES_ENTROPY */
++
++#ifdef CONFIG_LRNG_RAW_IRQ_ENTROPY
++bool lrng_raw_irq_entropy_store(u32 value);
++#else /* CONFIG_LRNG_RAW_IRQ_ENTROPY */
++static inline bool lrng_raw_irq_entropy_store(u32 value) { return false; }
++#endif /* CONFIG_LRNG_RAW_IRQ_ENTROPY */
++
++#ifdef CONFIG_LRNG_RAW_IRQFLAGS_ENTROPY
++bool lrng_raw_irqflags_entropy_store(u32 value);
++#else /* CONFIG_LRNG_RAW_IRQFLAGS_ENTROPY */
++static inline bool lrng_raw_irqflags_entropy_store(u32 value) { return false; }
++#endif /* CONFIG_LRNG_RAW_IRQFLAGS_ENTROPY */
++
++#ifdef CONFIG_LRNG_RAW_RETIP_ENTROPY
++bool lrng_raw_retip_entropy_store(u32 value);
++#else /* CONFIG_LRNG_RAW_RETIP_ENTROPY */
++static inline bool lrng_raw_retip_entropy_store(u32 value) { return false; }
++#endif /* CONFIG_LRNG_RAW_RETIP_ENTROPY */
++
++#ifdef CONFIG_LRNG_RAW_REGS_ENTROPY
++bool lrng_raw_regs_entropy_store(u32 value);
++#else /* CONFIG_LRNG_RAW_REGS_ENTROPY */
++static inline bool lrng_raw_regs_entropy_store(u32 value) { return false; }
++#endif /* CONFIG_LRNG_RAW_REGS_ENTROPY */
++
++#ifdef CONFIG_LRNG_RAW_ARRAY
++bool lrng_raw_array_entropy_store(u32 value);
++#else /* CONFIG_LRNG_RAW_ARRAY */
++static inline bool lrng_raw_array_entropy_store(u32 value) { return false; }
++#endif /* CONFIG_LRNG_RAW_ARRAY */
++
++#ifdef CONFIG_LRNG_IRQ_PERF
++bool lrng_perf_time(u32 start);
++#else /* CONFIG_LRNG_IRQ_PERF */
++static inline bool lrng_perf_time(u32 start) { return false; }
++#endif /* CONFIG_LRNG_IRQ_PERF */
++
++#endif /* _LRNG_INTERNAL_H */
+--- /dev/null
++++ b/drivers/char/lrng/lrng_pool.c
+@@ -0,0 +1,622 @@
++// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
++/*
++ * LRNG Entropy sources management
++ * LRNG Slow Entropy Source: Auxiliary entropy pool
++ *
++ * Copyright (C) 2016 - 2021, Stephan Mueller
++ */
++
++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
++
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include "lrng_internal.h"
++#include "lrng_sw_noise.h"
++
++struct lrng_state {
++ bool can_invalidate; /* Can invalidate batched entropy? */
++ bool perform_seedwork; /* Can seed work be performed? */
++ bool lrng_operational; /* Is DRNG operational? */
++ bool lrng_fully_seeded; /* Is DRNG fully seeded? */
++ bool lrng_min_seeded; /* Is DRNG minimally seeded? */
++ bool all_online_numa_node_seeded;/* All NUMA DRNGs seeded? */
++
++ /*
++	 * To ensure that external entropy providers cannot dominate the
++	 * internal noise sources, but also cannot be dominated by them, the
++	 * following booleans allow an external entropy provider to deliver
++	 * seed data once per DRNG reseed. This triggering of the external
++	 * noise sources is performed even when the entropy pool has
++	 * sufficient entropy.
++ */
++ bool lrng_seed_hw; /* Allow HW to provide seed */
++ bool lrng_seed_user; /* Allow user space to provide seed */
++
++ atomic_t boot_entropy_thresh; /* Reseed threshold */
++	atomic_t reseed_in_progress;	/* Flag indicating a reseed in progress */
++ struct work_struct lrng_seed_work; /* (re)seed work queue */
++};
++
++/*
++ * This is the auxiliary pool
++ *
++ * The aux pool array is aligned to 8 bytes to accommodate the kernel crypto
++ * API implementations of the hash functions used to read the pool: for some
++ * accelerated implementations, we need an alignment to avoid a realignment
++ * which involves memcpy(). The alignment to 8 bytes should satisfy all crypto
++ * implementations.
++ */
++struct lrng_pool {
++ u8 aux_pool[LRNG_POOL_SIZE]; /* Aux pool: digest state */
++ atomic_t aux_entropy_bits;
++ atomic_t digestsize; /* Digest size of used hash */
++ bool initialized; /* Aux pool initialized? */
++
++ /* Serialize read of entropy pool and update of aux pool */
++ spinlock_t lock;
++};
++
++static struct lrng_pool lrng_pool __aligned(LRNG_KCAPI_ALIGN) = {
++ .aux_entropy_bits = ATOMIC_INIT(0),
++ .digestsize = ATOMIC_INIT(LRNG_ATOMIC_DIGEST_SIZE),
++ .initialized = false,
++ .lock = __SPIN_LOCK_UNLOCKED(lrng_pool.lock)
++};
++
++static struct lrng_state lrng_state = {
++ false, false, false, false, false, false, true, true,
++ .boot_entropy_thresh = ATOMIC_INIT(LRNG_INIT_ENTROPY_BITS),
++ .reseed_in_progress = ATOMIC_INIT(0),
++};
++
++/********************************** Helper ***********************************/
++
++/* External entropy provider is allowed to provide seed data */
++bool lrng_state_exseed_allow(enum lrng_external_noise_source source)
++{
++ if (source == lrng_noise_source_hw)
++ return lrng_state.lrng_seed_hw;
++ return lrng_state.lrng_seed_user;
++}
++
++/* Enable / disable external entropy provider to furnish seed */
++void lrng_state_exseed_set(enum lrng_external_noise_source source, bool type)
++{
++ if (source == lrng_noise_source_hw)
++ lrng_state.lrng_seed_hw = type;
++ else
++ lrng_state.lrng_seed_user = type;
++}
++
++static inline void lrng_state_exseed_allow_all(void)
++{
++ lrng_state_exseed_set(lrng_noise_source_hw, true);
++ lrng_state_exseed_set(lrng_noise_source_user, true);
++}
++
++/* Entropy in bits present in aux pool */
++u32 lrng_avail_aux_entropy(void)
++{
++ /* Cap available entropy with max entropy */
++ u32 avail_bits = min_t(u32, lrng_get_digestsize(),
++ atomic_read_u32(&lrng_pool.aux_entropy_bits));
++
++ /* Consider oversampling rate due to aux pool conditioning */
++ return lrng_reduce_by_osr(avail_bits);
++}
++
++/* Set the digest size of the used hash in bytes */
++static inline void lrng_set_digestsize(u32 digestsize)
++{
++ struct lrng_pool *pool = &lrng_pool;
++ u32 ent_bits = atomic_xchg_relaxed(&pool->aux_entropy_bits, 0),
++ old_digestsize = lrng_get_digestsize();
++
++ atomic_set(&lrng_pool.digestsize, digestsize);
++
++ /*
++ * In case the new digest is larger than the old one, cap the available
++ * entropy to the old message digest used to process the existing data.
++ */
++ ent_bits = min_t(u32, ent_bits, old_digestsize);
++ atomic_add(ent_bits, &pool->aux_entropy_bits);
++}
++
++/* Obtain the digest size provided by the used hash in bits */
++u32 lrng_get_digestsize(void)
++{
++ return atomic_read_u32(&lrng_pool.digestsize) << 3;
++}
++
++/*
++ * Reading of the LRNG pool is only allowed by one caller. The reading is
++ * only performed to (re)seed DRNGs. Thus, if this "lock" is already taken,
++ * a reseeding operation is in progress. The caller is not expected to wait
++ * but should continue with its other operations.
++ */
++int lrng_pool_trylock(void)
++{
++ return atomic_cmpxchg(&lrng_state.reseed_in_progress, 0, 1);
++}
++
++void lrng_pool_unlock(void)
++{
++ atomic_set(&lrng_state.reseed_in_progress, 0);
++}
++
++/* Set new entropy threshold for reseeding during boot */
++void lrng_set_entropy_thresh(u32 new_entropy_bits)
++{
++ atomic_set(&lrng_state.boot_entropy_thresh, new_entropy_bits);
++}
++
++/*
++ * Reset LRNG state - the entropy counters are reset, but the data that may
++ * or may not have entropy remains in the pools as this data will not hurt.
++ */
++void lrng_reset_state(void)
++{
++ atomic_set(&lrng_pool.aux_entropy_bits, 0);
++ lrng_pcpu_reset();
++ lrng_state.lrng_operational = false;
++ lrng_state.lrng_fully_seeded = false;
++ lrng_state.lrng_min_seeded = false;
++ lrng_state.all_online_numa_node_seeded = false;
++ pr_debug("reset LRNG\n");
++}
++
++/* Set flag that all DRNGs are fully seeded */
++void lrng_pool_all_numa_nodes_seeded(bool set)
++{
++ lrng_state.all_online_numa_node_seeded = set;
++}
++
++/* Return boolean whether LRNG reached minimally seed level */
++bool lrng_state_min_seeded(void)
++{
++ return lrng_state.lrng_min_seeded;
++}
++
++/* Return boolean whether LRNG reached fully seed level */
++bool lrng_state_fully_seeded(void)
++{
++ return lrng_state.lrng_fully_seeded;
++}
++
++/* Return boolean whether LRNG is considered fully operational */
++bool lrng_state_operational(void)
++{
++ return lrng_state.lrng_operational;
++}
++
++/* Policy to check whether entropy buffer contains full seeded entropy */
++bool lrng_fully_seeded(struct entropy_buf *eb)
++{
++ return ((eb->a_bits + eb->b_bits + eb->c_bits + eb->d_bits) >=
++ lrng_get_seed_entropy_osr());
++}
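
For example (again assuming CONFIG_LRNG_SEED_BUFFER_INIT_ADD_BITS = 128 in an
SP800-90C compliant setup), lrng_get_seed_entropy_osr() evaluates to
256 + 128 = 384 bits, so the four slices of the entropy buffer must together
carry at least 384 bits of estimated entropy before the DRNG is considered
fully seeded; without the compliance option, 256 bits suffice.
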
++
++/* Disable the fully seeded and operational mode */
++void lrng_unset_operational(void)
++{
++ lrng_pool_all_numa_nodes_seeded(false);
++ lrng_state.lrng_operational = false;
++ lrng_state.lrng_fully_seeded = false;
++}
++
++/* Policy to enable LRNG operational mode */
++static inline void lrng_set_operational(u32 external_es)
++{
++ if (lrng_state.lrng_fully_seeded &&
++ (lrng_sp80090b_startup_complete() ||
++ (lrng_get_seed_entropy_osr() <= external_es))) {
++ lrng_state.lrng_operational = true;
++ lrng_process_ready_list();
++ lrng_init_wakeup();
++ pr_info("LRNG fully operational\n");
++ }
++}
++
++/* Set entropy content in user-space controllable aux pool */
++void lrng_pool_set_entropy(u32 entropy_bits)
++{
++ atomic_set(&lrng_pool.aux_entropy_bits, entropy_bits);
++}
++
++/* Available entropy in the entire LRNG considering all entropy sources */
++u32 lrng_avail_entropy(void)
++{
++ u32 ent_thresh = lrng_security_strength();
++
++ /*
++ * Apply oversampling during initialization according to SP800-90C as
++ * we request a larger buffer from the ES.
++ */
++ if (lrng_sp80090c_compliant() &&
++ !lrng_state.all_online_numa_node_seeded)
++ ent_thresh += CONFIG_LRNG_SEED_BUFFER_INIT_ADD_BITS;
++
++ return lrng_pcpu_avail_entropy() + lrng_avail_aux_entropy() +
++ lrng_archrandom_entropylevel(ent_thresh) +
++ lrng_jent_entropylevel(ent_thresh);
++}
++
++/**
++ * lrng_init_ops() - Set seed stages of LRNG
++ *
++ * Set the slow noise source reseed trigger threshold. The initial threshold
++ * is set to the minimum data size that can be read from the pool: a word. Upon
++ * reaching this value, the next seed threshold of 128 bits is set followed
++ * by 256 bits.
++ *
++ * @eb: entropy buffer holding the amount of entropy currently injected into
++ *	the DRNG
++ */
++void lrng_init_ops(struct entropy_buf *eb)
++{
++ struct lrng_state *state = &lrng_state;
++ u32 requested_bits, seed_bits, external_es;
++
++ if (state->lrng_operational)
++ return;
++
++ requested_bits = lrng_get_seed_entropy_osr();
++
++ /*
++ * Entropy provided by external entropy sources - if they provide
++ * the requested amount of entropy, unblock the interface.
++ */
++ external_es = eb->a_bits + eb->c_bits + eb->d_bits;
++ seed_bits = external_es + eb->b_bits;
++
++ /* DRNG is seeded with full security strength */
++ if (state->lrng_fully_seeded) {
++ lrng_set_operational(external_es);
++ lrng_set_entropy_thresh(requested_bits);
++ } else if (lrng_fully_seeded(eb)) {
++ if (state->can_invalidate)
++ invalidate_batched_entropy();
++
++ state->lrng_fully_seeded = true;
++ lrng_set_operational(external_es);
++ state->lrng_min_seeded = true;
++ pr_info("LRNG fully seeded with %u bits of entropy\n",
++ seed_bits);
++ lrng_set_entropy_thresh(requested_bits);
++ } else if (!state->lrng_min_seeded) {
++
++ /* DRNG is seeded with at least 128 bits of entropy */
++ if (seed_bits >= LRNG_MIN_SEED_ENTROPY_BITS) {
++ if (state->can_invalidate)
++ invalidate_batched_entropy();
++
++ state->lrng_min_seeded = true;
++ pr_info("LRNG minimally seeded with %u bits of entropy\n",
++ seed_bits);
++ lrng_set_entropy_thresh(requested_bits);
++ lrng_init_wakeup();
++
++ /* DRNG is seeded with at least LRNG_INIT_ENTROPY_BITS bits */
++ } else if (seed_bits >= LRNG_INIT_ENTROPY_BITS) {
++ pr_info("LRNG initial entropy level %u bits of entropy\n",
++ seed_bits);
++ lrng_set_entropy_thresh(LRNG_MIN_SEED_ENTROPY_BITS);
++ }
++ }
++}
++
++int __init rand_initialize(void)
++{
++ struct seed {
++ ktime_t time;
++ unsigned long data[(LRNG_MAX_DIGESTSIZE /
++ sizeof(unsigned long))];
++ struct new_utsname utsname;
++ } seed __aligned(LRNG_KCAPI_ALIGN);
++ unsigned int i;
++
++ BUILD_BUG_ON(LRNG_MAX_DIGESTSIZE % sizeof(unsigned long));
++
++ seed.time = ktime_get_real();
++
++ for (i = 0; i < ARRAY_SIZE(seed.data); i++) {
++ if (!arch_get_random_seed_long_early(&(seed.data[i])) &&
++ !arch_get_random_long_early(&seed.data[i]))
++ seed.data[i] = random_get_entropy();
++ }
++ memcpy(&seed.utsname, utsname(), sizeof(*(utsname())));
++
++ lrng_pool_insert_aux((u8 *)&seed, sizeof(seed), 0);
++ memzero_explicit(&seed, sizeof(seed));
++
++ /* Initialize the seed work queue */
++ INIT_WORK(&lrng_state.lrng_seed_work, lrng_drng_seed_work);
++ lrng_state.perform_seedwork = true;
++
++ lrng_drngs_init_cc20(true);
++ invalidate_batched_entropy();
++
++ lrng_state.can_invalidate = true;
++
++ return 0;
++}
++
++/*
++ * Replace old with new hash for auxiliary pool handling
++ *
++ * Assumption: the caller must guarantee that the new_cb is available during the
++ * entire operation (e.g. it must hold the write lock against pointer updating).
++ */
++int lrng_aux_switch_hash(const struct lrng_crypto_cb *new_cb, void *new_hash,
++ const struct lrng_crypto_cb *old_cb)
++{
++ struct lrng_pool *pool = &lrng_pool;
++ struct shash_desc *shash = (struct shash_desc *)pool->aux_pool;
++ u8 digest[LRNG_MAX_DIGESTSIZE];
++ int ret;
++
++ if (!IS_ENABLED(CONFIG_LRNG_DRNG_SWITCH))
++ return -EOPNOTSUPP;
++
++ if (unlikely(!pool->initialized))
++ return 0;
++
++ /* Get the aux pool hash with old digest ... */
++ ret = old_cb->lrng_hash_final(shash, digest) ?:
++ /* ... re-initialize the hash with the new digest ... */
++ new_cb->lrng_hash_init(shash, new_hash) ?:
++ /*
++ * ... feed the old hash into the new state. We may feed
++ * uninitialized memory into the new state, but this is
++ * considered no issue and even good as we have some more
++ * uncertainty here.
++ */
++ new_cb->lrng_hash_update(shash, digest, sizeof(digest));
++ if (!ret) {
++ lrng_set_digestsize(new_cb->lrng_hash_digestsize(new_hash));
++ pr_debug("Re-initialize aux entropy pool with hash %s\n",
++ new_cb->lrng_hash_name());
++ }
++
++ memzero_explicit(digest, sizeof(digest));
++ return ret;
++}
++
++/*
++ * Insert data into auxiliary pool by hashing the input data together with
++ * the auxiliary pool. The message digest is the new state of the auxiliary
++ * pool.
++ */
++static int
++lrng_pool_insert_aux_locked(const u8 *inbuf, u32 inbuflen, u32 entropy_bits)
++{
++ struct lrng_pool *pool = &lrng_pool;
++ struct shash_desc *shash = (struct shash_desc *)pool->aux_pool;
++ struct lrng_drng *drng = lrng_drng_init_instance();
++ const struct lrng_crypto_cb *crypto_cb;
++ unsigned long flags;
++ void *hash;
++ int ret;
++
++ entropy_bits = min_t(u32, entropy_bits, inbuflen << 3);
++
++ read_lock_irqsave(&drng->hash_lock, flags);
++
++ crypto_cb = drng->crypto_cb;
++ hash = drng->hash;
++
++ if (unlikely(!pool->initialized)) {
++ ret = crypto_cb->lrng_hash_init(shash, hash);
++ if (ret)
++ goto out;
++ pool->initialized = true;
++ }
++
++ ret = crypto_cb->lrng_hash_update(shash, inbuf, inbuflen);
++ if (ret)
++ goto out;
++
++ /*
++ * Cap the available entropy to the hash output size compliant to
++ * SP800-90B section 3.1.5.1 table 1.
++ */
++ entropy_bits += atomic_read_u32(&pool->aux_entropy_bits);
++ atomic_set(&pool->aux_entropy_bits,
++ min_t(u32, entropy_bits,
++ crypto_cb->lrng_hash_digestsize(hash) << 3));
++
++out:
++ read_unlock_irqrestore(&drng->hash_lock, flags);
++ return ret;
++}
++
++int lrng_pool_insert_aux(const u8 *inbuf, u32 inbuflen, u32 entropy_bits)
++{
++ struct lrng_pool *pool = &lrng_pool;
++ unsigned long flags;
++ int ret;
++
++ spin_lock_irqsave(&pool->lock, flags);
++ ret = lrng_pool_insert_aux_locked(inbuf, inbuflen, entropy_bits);
++ spin_unlock_irqrestore(&pool->lock, flags);
++
++ lrng_pool_add_entropy();
++
++ return ret;
++}
++
++/* Hot code path during boot - mix data into entropy pool during boot */
++void lrng_pool_add_entropy(void)
++{
++ /*
++ * Once all DRNGs are fully seeded, the interrupt noise
++ * sources will not trigger any reseeding any more.
++ */
++ if (likely(lrng_state.all_online_numa_node_seeded))
++ return;
++
++ /* Only try to reseed if the DRNG is alive. */
++ if (!lrng_get_available())
++ return;
++
++ /* Only trigger the DRNG reseed if we have collected entropy. */
++ if (lrng_avail_entropy() <
++ atomic_read_u32(&lrng_state.boot_entropy_thresh))
++ return;
++
++ /* Ensure that the seeding only occurs once at any given time. */
++ if (lrng_pool_trylock())
++ return;
++
++ /* Seed the DRNG with IRQ noise. */
++ if (lrng_state.perform_seedwork)
++ schedule_work(&lrng_state.lrng_seed_work);
++ else
++ lrng_drng_seed_work(NULL);
++}
++
++/************************* Get data from entropy pool *************************/
++
++/**
++ * Get auxiliary entropy pool and its entropy content for seed buffer.
++ * Caller must hold lrng_pool.lock.
++ * @outbuf: buffer to store data in with size requested_bits
++ * @requested_bits: Requested amount of entropy
++ * @return: amount of entropy in outbuf in bits.
++ */
++static inline u32 lrng_get_aux_pool(u8 *outbuf, u32 requested_bits)
++{
++ struct lrng_pool *pool = &lrng_pool;
++ struct shash_desc *shash = (struct shash_desc *)pool->aux_pool;
++ struct lrng_drng *drng = lrng_drng_init_instance();
++ const struct lrng_crypto_cb *crypto_cb;
++ unsigned long flags;
++ void *hash;
++ u32 collected_ent_bits, returned_ent_bits, unused_bits = 0,
++ digestsize;
++ u8 aux_output[LRNG_MAX_DIGESTSIZE];
++
++ if (unlikely(!pool->initialized))
++ return 0;
++
++ read_lock_irqsave(&drng->hash_lock, flags);
++
++ crypto_cb = drng->crypto_cb;
++ hash = drng->hash;
++ digestsize = crypto_cb->lrng_hash_digestsize(hash);
++
++ /* Ensure that no more than the size of aux_pool can be requested */
++ requested_bits = min_t(u32, requested_bits, (LRNG_MAX_DIGESTSIZE << 3));
++
++ /* Cap entropy with entropy counter from aux pool and the used digest */
++ collected_ent_bits = min_t(u32, digestsize << 3,
++ atomic_xchg_relaxed(&pool->aux_entropy_bits, 0));
++
++ /* We collected too much entropy and put the overflow back */
++ if (collected_ent_bits > (requested_bits + lrng_compress_osr())) {
++ /* Amount of bits we collected too much */
++ unused_bits = collected_ent_bits - requested_bits;
++ /* Put entropy back */
++ atomic_add(unused_bits, &pool->aux_entropy_bits);
++ /* Fix collected entropy */
++ collected_ent_bits = requested_bits;
++ }
++
++ /* Apply oversampling: discount requested oversampling rate */
++ returned_ent_bits = lrng_reduce_by_osr(collected_ent_bits);
++
++ pr_debug("obtained %u bits by collecting %u bits of entropy from aux pool, %u bits of entropy remaining\n",
++ returned_ent_bits, collected_ent_bits, unused_bits);
++
++ /* Get the digest for the aux pool to be returned to the caller ... */
++ if (crypto_cb->lrng_hash_final(shash, aux_output) ||
++ /*
++ * ... and re-initialize the aux state. Do not add the aux pool
++ * digest for backward secrecy as it will be added with the
++ * insertion of the complete seed buffer after it has been filled.
++ */
++ crypto_cb->lrng_hash_init(shash, hash)) {
++ returned_ent_bits = 0;
++ } else {
++ /*
++ * Do not truncate the output size exactly to collected_ent_bits
++ * as the aux pool may contain data that is not credited with
++ * entropy, but we want to use them to stir the DRNG state.
++ */
++ memcpy(outbuf, aux_output, requested_bits >> 3);
++ }
++
++ read_unlock_irqrestore(&drng->hash_lock, flags);
++ memzero_explicit(aux_output, digestsize);
++ return returned_ent_bits;
++}
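++
++/*
++ * Illustrative walk-through of the accounting above (a sketch, not part of
++ * the original patch; it assumes a 32-byte digest and that the oversampling
++ * helpers lrng_compress_osr() and lrng_reduce_by_osr() act as no-ops):
++ *
++ *   requested_bits     = 128
++ *   aux_entropy_bits   = 256  (already capped at insertion time to the
++ *                              digest size)
++ *   collected_ent_bits = min(256, 256) = 256
++ *   -> 256 > 128 + 0, so 128 unused bits are added back to the counter
++ *   collected_ent_bits = 128, returned_ent_bits = 128
++ *   -> requested_bits >> 3 == 16 bytes of the digest are copied to outbuf
++ */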
++
++/* Fill the seed buffer with data from the noise sources */
++void lrng_fill_seed_buffer(struct entropy_buf *entropy_buf, u32 requested_bits)
++{
++ struct lrng_pool *pool = &lrng_pool;
++ struct lrng_state *state = &lrng_state;
++ unsigned long flags;
++ u32 pcpu_request, req_ent = lrng_sp80090c_compliant() ?
++ lrng_security_strength() : LRNG_MIN_SEED_ENTROPY_BITS;
++
++ /* Guarantee that requested bits is a multiple of bytes */
++ BUILD_BUG_ON(LRNG_DRNG_SECURITY_STRENGTH_BITS % 8);
++
++ /* always reseed the DRNG with the current time stamp */
++ entropy_buf->now = random_get_entropy();
++
++ /*
++ * Require at least 128 bits of entropy for any reseed. If the LRNG is
++ * operated SP800-90C compliant we want to comply with SP800-90A section
++ * 9.2 mandating that DRNG is reseeded with the security strength.
++ */
++ if (state->lrng_fully_seeded && (lrng_avail_entropy() < req_ent)) {
++ entropy_buf->a_bits = entropy_buf->b_bits = 0;
++ entropy_buf->c_bits = entropy_buf->d_bits = 0;
++ goto wakeup;
++ }
++
++ /* Ensure aux pool extraction and backtracking op are atomic */
++ spin_lock_irqsave(&pool->lock, flags);
++
++ /* Concatenate the output of the entropy sources. */
++ entropy_buf->a_bits = lrng_get_aux_pool(entropy_buf->a, requested_bits);
++
++ /*
++	 * If the aux pool returned entropy, pull correspondingly less from the
++	 * per-CPU pool, but attempt to get at least LRNG_MIN_SEED_ENTROPY_BITS
++	 * of entropy.
++ */
++ pcpu_request = max_t(u32, requested_bits - entropy_buf->a_bits,
++ LRNG_MIN_SEED_ENTROPY_BITS);
++ entropy_buf->b_bits = lrng_pcpu_pool_hash(entropy_buf->b, pcpu_request,
++ state->lrng_fully_seeded);
++
++ entropy_buf->c_bits = lrng_get_arch(entropy_buf->c, requested_bits);
++ entropy_buf->d_bits = lrng_get_jent(entropy_buf->d, requested_bits);
++
++ /* Mix the extracted data back into pool for backtracking resistance */
++ if (lrng_pool_insert_aux_locked((u8 *)entropy_buf,
++ sizeof(struct entropy_buf), 0))
++ pr_warn("Backtracking resistance operation failed\n");
++
++ spin_unlock_irqrestore(&pool->lock, flags);
++
++ /* allow external entropy provider to provide seed */
++ lrng_state_exseed_allow_all();
++
++wakeup:
++ /*
++	 * Shall we wake up user space writers? This location ensures that
++	 * the user space provider does not dominate the internal
++ * noise sources since in case the first call of this function finds
++ * sufficient entropy in the entropy pool, it will not trigger the
++ * wakeup. This implies that when the next /dev/urandom read happens,
++ * the entropy pool is drained.
++ */
++ lrng_writer_wakeup();
++}
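++
++/*
++ * Summary sketch (illustrative, not part of the original patch) of how the
++ * seed buffer is populated above: slot a receives the auxiliary pool output,
++ * slot b the hashed per-CPU interrupt pools, slot c the CPU/architecture
++ * random number generator output and slot d the Jitter RNG output, each with
++ * its own entropy estimate. The filled buffer is then hashed back into the
++ * auxiliary pool so that knowledge of the DRNG seed does not reveal prior
++ * pool states (backtracking resistance).
++ */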
+--- /dev/null
++++ b/drivers/char/lrng/lrng_sw_noise.c
+@@ -0,0 +1,702 @@
++// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
++/*
++ * LRNG Slow Entropy Source: Interrupt data collection
++ *
++ * Copyright (C) 2016 - 2021, Stephan Mueller
++ */
++
++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
++
++#include
++#include
++#include
++#include
++#include
++
++#include "lrng_internal.h"
++#include "lrng_sw_noise.h"
++
++/* Number of interrupts required for LRNG_DRNG_SECURITY_STRENGTH_BITS entropy */
++static u32 lrng_irq_entropy_bits = LRNG_IRQ_ENTROPY_BITS;
++/* Is high-resolution timer present? */
++static bool lrng_irq_highres_timer = false;
++
++static u32 irq_entropy __read_mostly = LRNG_IRQ_ENTROPY_BITS;
++#ifdef CONFIG_LRNG_RUNTIME_ES_CONFIG
++module_param(irq_entropy, uint, 0444);
++MODULE_PARM_DESC(irq_entropy,
++ "How many interrupts must be collected for obtaining 256 bits of entropy\n");
++#endif
++
++/* Per-CPU array holding concatenated entropy events */
++static DEFINE_PER_CPU(u32 [LRNG_DATA_ARRAY_SIZE], lrng_pcpu_array)
++ __aligned(LRNG_KCAPI_ALIGN);
++static DEFINE_PER_CPU(u32, lrng_pcpu_array_ptr) = 0;
++static DEFINE_PER_CPU(atomic_t, lrng_pcpu_array_irqs) = ATOMIC_INIT(0);
++
++/*
++ * The entropy collection is performed by executing the following steps:
++ * 1. fill up the per-CPU array holding the time stamps
++ * 2. once the per-CPU array is full, a compression of the data into
++ * the entropy pool is performed - this happens in interrupt context
++ *
++ * If step 2 is not desired in interrupt context, the following boolean
++ * needs to be set to false. This implies that old entropy data in the
++ * per-CPU array collected since the last DRNG reseed is overwritten with
++ * new entropy data instead of retaining the entropy with the compression
++ * operation.
++ *
++ * Impact on entropy:
++ *
++ * If continuous compression is enabled, the maximum entropy that is collected
++ * per CPU between DRNG reseeds is equal to the digest size of the used hash.
++ *
++ * If continuous compression is disabled, the maximum number of entropy events
++ * that can be collected per CPU is equal to LRNG_DATA_ARRAY_SIZE. This amount
++ * of events is converted into an entropy statement which then represents the
++ * maximum amount of entropy collectible per CPU between DRNG reseeds.
++ */
++static bool lrng_pcpu_continuous_compression __read_mostly =
++ IS_ENABLED(CONFIG_LRNG_ENABLE_CONTINUOUS_COMPRESSION);
++
++#ifdef CONFIG_LRNG_SWITCHABLE_CONTINUOUS_COMPRESSION
++module_param(lrng_pcpu_continuous_compression, bool, 0444);
++MODULE_PARM_DESC(lrng_pcpu_continuous_compression,
++ "Perform entropy compression if per-CPU entropy data array is full\n");
++#endif
++
++/*
++ * Per-CPU entropy pool with compressed entropy event
++ *
++ * The per-CPU entropy pool is defined as the hash state. New data is simply
++ * inserted into the entropy pool by performing a hash update operation.
++ * To read the entropy pool, a hash final must be invoked. However, before
++ * the entropy pool is released again after a hash final, the hash init must
++ * be performed.
++ */
++static DEFINE_PER_CPU(u8 [LRNG_POOL_SIZE], lrng_pcpu_pool)
++ __aligned(LRNG_KCAPI_ALIGN);
++/*
++ * Lock to allow other CPUs to read the pool - as this is only done during
++ * reseed which is infrequent, this lock is hardly contended.
++ */
++static DEFINE_PER_CPU(spinlock_t, lrng_pcpu_lock);
++static DEFINE_PER_CPU(bool, lrng_pcpu_lock_init) = false;
++
++/* Return boolean whether LRNG identified presence of high-resolution timer */
++bool lrng_pool_highres_timer(void)
++{
++ return lrng_irq_highres_timer;
++}
++
++/* Convert entropy in bits into number of IRQs with the same entropy content. */
++static inline u32 lrng_entropy_to_data(u32 entropy_bits)
++{
++ return ((entropy_bits * lrng_irq_entropy_bits) /
++ LRNG_DRNG_SECURITY_STRENGTH_BITS);
++}
++
++/* Convert number of IRQs into entropy value. */
++static inline u32 lrng_data_to_entropy(u32 irqnum)
++{
++ return ((irqnum * LRNG_DRNG_SECURITY_STRENGTH_BITS) /
++ lrng_irq_entropy_bits);
++}
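++
++/*
++ * Illustrative example (a sketch, not part of the original patch): assuming
++ * lrng_irq_entropy_bits == 256 and LRNG_DRNG_SECURITY_STRENGTH_BITS == 256,
++ * each interrupt is credited with one bit of entropy:
++ *
++ *   lrng_entropy_to_data(128) == 128   (128 bits -> 128 interrupts)
++ *   lrng_data_to_entropy(512) == 512   (512 interrupts -> 512 bits)
++ *
++ * Without a high-resolution timer, lrng_irq_entropy_bits is scaled up by
++ * LRNG_IRQ_OVERSAMPLING_FACTOR in lrng_init_time_source(), so correspondingly
++ * more interrupts are required per credited bit.
++ */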
++
++static inline bool lrng_pcpu_pool_online(int cpu)
++{
++ return per_cpu(lrng_pcpu_lock_init, cpu);
++}
++
++bool lrng_pcpu_continuous_compression_state(void)
++{
++ return lrng_pcpu_continuous_compression;
++}
++
++static void lrng_pcpu_check_compression_state(void)
++{
++ /* One pool must hold sufficient entropy for disabled compression */
++ if (!lrng_pcpu_continuous_compression) {
++ u32 max_ent = min_t(u32, lrng_get_digestsize(),
++ lrng_data_to_entropy(LRNG_DATA_NUM_VALUES));
++ if (max_ent < lrng_security_strength()) {
++ pr_warn("Force continuous compression operation to ensure LRNG can hold enough entropy\n");
++ lrng_pcpu_continuous_compression = true;
++ }
++ }
++}
++
++static int __init lrng_init_time_source(void)
++{
++ /* Set a minimum number of interrupts that must be collected */
++ irq_entropy = max_t(u32, LRNG_IRQ_ENTROPY_BITS, irq_entropy);
++
++ if ((random_get_entropy() & LRNG_DATA_SLOTSIZE_MASK) ||
++ (random_get_entropy() & LRNG_DATA_SLOTSIZE_MASK)) {
++ /*
++		 * As the highres timer is only identified here, interrupts
++		 * obtained earlier during boot are treated as if only a
++		 * lowres timer had been present.
++ */
++ lrng_irq_highres_timer = true;
++ lrng_irq_entropy_bits = irq_entropy;
++ } else {
++ lrng_health_disable();
++ lrng_irq_highres_timer = false;
++ lrng_irq_entropy_bits = irq_entropy *
++ LRNG_IRQ_OVERSAMPLING_FACTOR;
++ pr_warn("operating without high-resolution timer and applying IRQ oversampling factor %u\n",
++ LRNG_IRQ_OVERSAMPLING_FACTOR);
++ lrng_pcpu_check_compression_state();
++ }
++
++ return 0;
++}
++core_initcall(lrng_init_time_source);
++
++/*
++ * Reset all per-CPU pools - reset entropy estimator but leave the pool data
++ * that may or may not have entropy unchanged.
++ */
++void lrng_pcpu_reset(void)
++{
++ int cpu;
++
++ for_each_online_cpu(cpu)
++ atomic_set(per_cpu_ptr(&lrng_pcpu_array_irqs, cpu), 0);
++}
++
++u32 lrng_pcpu_avail_pool_size(void)
++{
++ u32 max_size = 0, max_pool = lrng_get_digestsize();
++ int cpu;
++
++ if (!lrng_pcpu_continuous_compression)
++ max_pool = min_t(u32, max_pool, LRNG_DATA_NUM_VALUES);
++
++ for_each_online_cpu(cpu) {
++ if (lrng_pcpu_pool_online(cpu))
++ max_size += max_pool;
++ }
++
++ return max_size;
++}
++
++/* Return entropy of unused IRQs present in all per-CPU pools. */
++u32 lrng_pcpu_avail_entropy(void)
++{
++ u32 digestsize_irqs, irq = 0;
++ int cpu;
++
++ /* Obtain the cap of maximum numbers of IRQs we count */
++ digestsize_irqs = lrng_entropy_to_data(lrng_get_digestsize());
++ if (!lrng_pcpu_continuous_compression) {
++ /* Cap to max. number of IRQs the array can hold */
++ digestsize_irqs = min_t(u32, digestsize_irqs,
++ LRNG_DATA_NUM_VALUES);
++ }
++
++ for_each_online_cpu(cpu) {
++ if (!lrng_pcpu_pool_online(cpu))
++ continue;
++ irq += min_t(u32, digestsize_irqs,
++ atomic_read_u32(per_cpu_ptr(&lrng_pcpu_array_irqs,
++ cpu)));
++ }
++
++ /* Consider oversampling rate */
++ return lrng_reduce_by_osr(lrng_data_to_entropy(irq));
++}
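++
++/*
++ * Example (illustrative only, assuming one bit of entropy per interrupt, a
++ * 32-byte digest, continuous compression enabled, all four per-CPU pools
++ * online and lrng_reduce_by_osr() as a no-op): with per-CPU counters holding
++ * 300, 50, 0 and 400 unprocessed interrupts, each counter is capped at
++ * digestsize_irqs == 256, giving 256 + 50 + 0 + 256 == 562 interrupts and
++ * therefore 562 bits of available per-CPU entropy.
++ */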
++
++/**
++ * Trigger a switch of the hash implementation for the per-CPU pool.
++ *
++ * For each per-CPU pool, obtain the message digest with the old hash
++ * implementation, initialize the per-CPU pool again with the new hash
++ * implementation and inject the message digest into the new state.
++ *
++ * Assumption: the caller must guarantee that the new_cb is available during the
++ * entire operation (e.g. it must hold the lock against pointer updating).
++ */
++int lrng_pcpu_switch_hash(int node,
++ const struct lrng_crypto_cb *new_cb, void *new_hash,
++ const struct lrng_crypto_cb *old_cb)
++{
++ u8 digest[LRNG_MAX_DIGESTSIZE];
++ u32 digestsize_irqs, found_irqs;
++ int ret = 0, cpu;
++
++ if (!IS_ENABLED(CONFIG_LRNG_DRNG_SWITCH))
++ return -EOPNOTSUPP;
++
++ for_each_online_cpu(cpu) {
++ struct shash_desc *pcpu_shash;
++
++ /*
++ * Only switch the per-CPU pools for the current node because
++ * the crypto_cb only applies NUMA-node-wide.
++ */
++ if (cpu_to_node(cpu) != node || !lrng_pcpu_pool_online(cpu))
++ continue;
++
++ pcpu_shash = (struct shash_desc *)per_cpu_ptr(lrng_pcpu_pool,
++ cpu);
++
++ digestsize_irqs = old_cb->lrng_hash_digestsize(pcpu_shash);
++ digestsize_irqs = lrng_entropy_to_data(digestsize_irqs << 3);
++
++ if (pcpu_shash->tfm == new_hash)
++ continue;
++
++ /* Get the per-CPU pool hash with old digest ... */
++ ret = old_cb->lrng_hash_final(pcpu_shash, digest) ?:
++ /* ... re-initialize the hash with the new digest ... */
++ new_cb->lrng_hash_init(pcpu_shash, new_hash) ?:
++ /*
++ * ... feed the old hash into the new state. We may feed
++ * uninitialized memory into the new state, but this is
++ * considered no issue and even good as we have some more
++ * uncertainty here.
++ */
++ new_cb->lrng_hash_update(pcpu_shash, digest,
++ sizeof(digest));
++ if (ret)
++ goto out;
++
++ /*
++ * In case the new digest is larger than the old one, cap
++ * the available entropy to the old message digest used to
++ * process the existing data.
++ */
++ found_irqs = atomic_xchg_relaxed(
++ per_cpu_ptr(&lrng_pcpu_array_irqs, cpu), 0);
++ found_irqs = min_t(u32, found_irqs, digestsize_irqs);
++ atomic_add_return_relaxed(found_irqs,
++ per_cpu_ptr(&lrng_pcpu_array_irqs, cpu));
++
++ pr_debug("Re-initialize per-CPU entropy pool for CPU %d on NUMA node %d with hash %s\n",
++ cpu, node, new_cb->lrng_hash_name());
++ }
++
++out:
++ memzero_explicit(digest, sizeof(digest));
++ return ret;
++}
++
++/*
++ * When reading the per-CPU message digest, make sure we use the crypto
++ * callbacks defined for the NUMA node the per-CPU pool is defined for because
++ * the LRNG crypto switch support is only atomic per NUMA node.
++ */
++static inline u32
++lrng_pcpu_pool_hash_one(const struct lrng_crypto_cb *pcpu_crypto_cb,
++ void *pcpu_hash, int cpu, u8 *digest, u32 *digestsize)
++{
++ struct shash_desc *pcpu_shash =
++ (struct shash_desc *)per_cpu_ptr(lrng_pcpu_pool, cpu);
++ spinlock_t *lock = per_cpu_ptr(&lrng_pcpu_lock, cpu);
++ unsigned long flags;
++ u32 digestsize_irqs, found_irqs;
++
++ /* Lock guarding against reading / writing to per-CPU pool */
++ spin_lock_irqsave(lock, flags);
++
++ *digestsize = pcpu_crypto_cb->lrng_hash_digestsize(pcpu_hash);
++ digestsize_irqs = lrng_entropy_to_data(*digestsize << 3);
++
++ /* Obtain entropy statement like for the entropy pool */
++ found_irqs = atomic_xchg_relaxed(
++ per_cpu_ptr(&lrng_pcpu_array_irqs, cpu), 0);
++ /* Cap to maximum amount of data we can hold in hash */
++ found_irqs = min_t(u32, found_irqs, digestsize_irqs);
++
++ /* Cap to maximum amount of data we can hold in array */
++ if (!lrng_pcpu_continuous_compression)
++ found_irqs = min_t(u32, found_irqs, LRNG_DATA_NUM_VALUES);
++
++ /* Store all not-yet compressed data in data array into hash, ... */
++ if (pcpu_crypto_cb->lrng_hash_update(pcpu_shash,
++ (u8 *)per_cpu_ptr(lrng_pcpu_array, cpu),
++ LRNG_DATA_ARRAY_SIZE * sizeof(u32)) ?:
++ /* ... get the per-CPU pool digest, ... */
++ pcpu_crypto_cb->lrng_hash_final(pcpu_shash, digest) ?:
++ /* ... re-initialize the hash, ... */
++ pcpu_crypto_cb->lrng_hash_init(pcpu_shash, pcpu_hash) ?:
++ /* ... feed the old hash into the new state. */
++ pcpu_crypto_cb->lrng_hash_update(pcpu_shash, digest, *digestsize))
++ found_irqs = 0;
++
++ spin_unlock_irqrestore(lock, flags);
++ return found_irqs;
++}
++
++/**
++ * Hash all per-CPU pools and return the digest to be used as seed data for
++ * seeding a DRNG. The caller must guarantee backtracking resistance.
++ * The function will only copy as much data as entropy is available into the
++ * caller-provided output buffer.
++ *
++ * This function handles the translation from the number of received interrupts
++ * into an entropy statement. The conversion depends on LRNG_IRQ_ENTROPY_BITS
++ * which defines how many interrupts must be received to obtain 256 bits of
++ * entropy. With this value, the function lrng_data_to_entropy converts a given
++ * data size (received interrupts, requested amount of data, etc.) into an
++ * entropy statement. lrng_entropy_to_data does the reverse.
++ *
++ * @outbuf: buffer to store data in with size requested_bits
++ * @requested_bits: Requested amount of entropy
++ * @fully_seeded: indicator whether LRNG is fully seeded
++ * @return: amount of entropy in outbuf in bits.
++ */
++u32 lrng_pcpu_pool_hash(u8 *outbuf, u32 requested_bits, bool fully_seeded)
++{
++ SHASH_DESC_ON_STACK(shash, NULL);
++ const struct lrng_crypto_cb *crypto_cb;
++ struct lrng_drng **lrng_drng = lrng_drng_instances();
++ struct lrng_drng *drng = lrng_drng_init_instance();
++ u8 digest[LRNG_MAX_DIGESTSIZE];
++ unsigned long flags, flags2;
++ u32 found_irqs, collected_irqs = 0, collected_ent_bits, requested_irqs,
++ returned_ent_bits;
++ int ret, cpu;
++ void *hash;
++
++ /* Lock guarding replacement of per-NUMA hash */
++ read_lock_irqsave(&drng->hash_lock, flags);
++
++ crypto_cb = drng->crypto_cb;
++ hash = drng->hash;
++
++	/* The hash state is filled with all per-CPU pool hashes. */
++ ret = crypto_cb->lrng_hash_init(shash, hash);
++ if (ret)
++ goto err;
++
++ requested_irqs = lrng_entropy_to_data(requested_bits) +
++ lrng_compress_osr();
++
++ /*
++ * Harvest entropy from each per-CPU hash state - even though we may
++ * have collected sufficient entropy, we will hash all per-CPU pools.
++ */
++ for_each_online_cpu(cpu) {
++ struct lrng_drng *pcpu_drng = drng;
++ u32 digestsize, pcpu_unused_irqs = 0;
++ int node = cpu_to_node(cpu);
++
++ /* If pool is not online, then no entropy is present. */
++ if (!lrng_pcpu_pool_online(cpu))
++ continue;
++
++ if (lrng_drng && lrng_drng[node])
++ pcpu_drng = lrng_drng[node];
++
++ if (pcpu_drng == drng) {
++ found_irqs = lrng_pcpu_pool_hash_one(crypto_cb, hash,
++ cpu, digest,
++ &digestsize);
++ } else {
++ read_lock_irqsave(&pcpu_drng->hash_lock, flags2);
++ found_irqs =
++ lrng_pcpu_pool_hash_one(pcpu_drng->crypto_cb,
++ pcpu_drng->hash, cpu,
++ digest, &digestsize);
++ read_unlock_irqrestore(&pcpu_drng->hash_lock, flags2);
++ }
++
++ /* Inject the digest into the state of all per-CPU pools */
++ ret = crypto_cb->lrng_hash_update(shash, digest, digestsize);
++ if (ret)
++ goto err;
++
++ collected_irqs += found_irqs;
++ if (collected_irqs > requested_irqs) {
++ pcpu_unused_irqs = collected_irqs - requested_irqs;
++ atomic_add_return_relaxed(pcpu_unused_irqs,
++ per_cpu_ptr(&lrng_pcpu_array_irqs, cpu));
++ collected_irqs = requested_irqs;
++ }
++ pr_debug("%u interrupts used from entropy pool of CPU %d, %u interrupts remain unused\n",
++ found_irqs - pcpu_unused_irqs, cpu, pcpu_unused_irqs);
++ }
++
++ ret = crypto_cb->lrng_hash_final(shash, digest);
++ if (ret)
++ goto err;
++
++ collected_ent_bits = lrng_data_to_entropy(collected_irqs);
++ /* Cap to maximum entropy that can ever be generated with given hash */
++ collected_ent_bits = min_t(u32, collected_ent_bits,
++ crypto_cb->lrng_hash_digestsize(hash) << 3);
++ /* Apply oversampling: discount requested oversampling rate */
++ returned_ent_bits = lrng_reduce_by_osr(collected_ent_bits);
++
++ pr_debug("obtained %u bits by collecting %u bits of entropy from entropy pool noise source\n",
++ returned_ent_bits, collected_ent_bits);
++
++ /*
++ * Truncate to available entropy as implicitly allowed by SP800-90B
++ * section 3.1.5.1.1 table 1 which awards truncated hashes full
++ * entropy.
++ *
++ * During boot time, we read requested_bits data with
++	 * returned_ent_bits of entropy. In case our conservative entropy
++	 * estimate underestimates the available entropy, this way we transport
++	 * as much of the available entropy as possible.
++ */
++ memcpy(outbuf, digest, fully_seeded ? returned_ent_bits >> 3 :
++ requested_bits >> 3);
++
++out:
++ crypto_cb->lrng_hash_desc_zero(shash);
++ read_unlock_irqrestore(&drng->hash_lock, flags);
++ memzero_explicit(digest, sizeof(digest));
++ return returned_ent_bits;
++
++err:
++ returned_ent_bits = 0;
++ goto out;
++}
++
++/* Compress the lrng_pcpu_array array into lrng_pcpu_pool */
++static inline void lrng_pcpu_array_compress(void)
++{
++ struct shash_desc *shash =
++ (struct shash_desc *)this_cpu_ptr(lrng_pcpu_pool);
++ struct lrng_drng **lrng_drng = lrng_drng_instances();
++ struct lrng_drng *drng = lrng_drng_init_instance();
++ const struct lrng_crypto_cb *crypto_cb;
++ spinlock_t *lock = this_cpu_ptr(&lrng_pcpu_lock);
++ unsigned long flags, flags2;
++ int node = numa_node_id();
++ void *hash;
++ bool init = false;
++
++ /* Get NUMA-node local hash instance */
++ if (lrng_drng && lrng_drng[node])
++ drng = lrng_drng[node];
++
++ read_lock_irqsave(&drng->hash_lock, flags);
++ crypto_cb = drng->crypto_cb;
++ hash = drng->hash;
++
++ if (unlikely(!this_cpu_read(lrng_pcpu_lock_init))) {
++ init = true;
++ spin_lock_init(lock);
++ this_cpu_write(lrng_pcpu_lock_init, true);
++ pr_debug("Initializing per-CPU entropy pool for CPU %d on NUMA node %d with hash %s\n",
++ raw_smp_processor_id(), node,
++ crypto_cb->lrng_hash_name());
++ }
++
++ spin_lock_irqsave(lock, flags2);
++
++ if (unlikely(init) && crypto_cb->lrng_hash_init(shash, hash)) {
++ this_cpu_write(lrng_pcpu_lock_init, false);
++ pr_warn("Initialization of hash failed\n");
++ } else if (lrng_pcpu_continuous_compression) {
++ /* Add entire per-CPU data array content into entropy pool. */
++ if (crypto_cb->lrng_hash_update(shash,
++ (u8 *)this_cpu_ptr(lrng_pcpu_array),
++ LRNG_DATA_ARRAY_SIZE * sizeof(u32)))
++ pr_warn_ratelimited("Hashing of entropy data failed\n");
++ }
++
++ spin_unlock_irqrestore(lock, flags2);
++ read_unlock_irqrestore(&drng->hash_lock, flags);
++}
++
++/* Compress data array into hash */
++static inline void lrng_pcpu_array_to_hash(u32 ptr)
++{
++ u32 *array = this_cpu_ptr(lrng_pcpu_array);
++
++ /*
++ * During boot time the hash operation is triggered more often than
++ * during regular operation.
++ */
++ if (unlikely(!lrng_state_fully_seeded())) {
++ if ((ptr & 31) && (ptr < LRNG_DATA_WORD_MASK))
++ return;
++ } else if (ptr < LRNG_DATA_WORD_MASK) {
++ return;
++ }
++
++ if (lrng_raw_array_entropy_store(*array)) {
++ u32 i;
++
++ /*
++		 * If we fed even a part of the array to external analysis, we
++		 * mark the entire array and the per-CPU pool as having no
++		 * entropy. This is due to the non-IID property of the data, as
++		 * we do not fully know whether the existing dependencies
++		 * diminish the entropy below what we expect it to have.
++ */
++ atomic_set(this_cpu_ptr(&lrng_pcpu_array_irqs), 0);
++
++ for (i = 1; i < LRNG_DATA_ARRAY_SIZE; i++)
++ lrng_raw_array_entropy_store(*(array + i));
++ } else {
++ lrng_pcpu_array_compress();
++ /* Ping pool handler about received entropy */
++ lrng_pool_add_entropy();
++ }
++}
++
++/*
++ * Concatenate full 32 bit word at the end of time array even when current
++ * ptr is not aligned to sizeof(data).
++ */
++static inline void _lrng_pcpu_array_add_u32(u32 data)
++{
++ /* Increment pointer by number of slots taken for input value */
++ u32 pre_ptr, mask, ptr = this_cpu_add_return(lrng_pcpu_array_ptr,
++ LRNG_DATA_SLOTS_PER_UINT);
++ unsigned int pre_array;
++
++ /*
++ * This function injects a unit into the array - guarantee that
++ * array unit size is equal to data type of input data.
++ */
++ BUILD_BUG_ON(LRNG_DATA_ARRAY_MEMBER_BITS != (sizeof(data) << 3));
++
++ /*
++ * The following logic requires at least two units holding
++ * the data as otherwise the pointer would immediately wrap when
++	 * injecting a u32 word.
++ */
++ BUILD_BUG_ON(LRNG_DATA_NUM_VALUES <= LRNG_DATA_SLOTS_PER_UINT);
++
++ lrng_pcpu_split_u32(&ptr, &pre_ptr, &mask);
++
++ /* MSB of data go into previous unit */
++ pre_array = lrng_data_idx2array(pre_ptr);
++ /* zeroization of slot to ensure the following OR adds the data */
++ this_cpu_and(lrng_pcpu_array[pre_array], ~(0xffffffff &~ mask));
++ this_cpu_or(lrng_pcpu_array[pre_array], data & ~mask);
++
++ /* Invoke compression as we just filled data array completely */
++ if (unlikely(pre_ptr > ptr))
++ lrng_pcpu_array_to_hash(LRNG_DATA_WORD_MASK);
++
++ /* LSB of data go into current unit */
++ this_cpu_write(lrng_pcpu_array[lrng_data_idx2array(ptr)],
++ data & mask);
++
++ if (likely(pre_ptr <= ptr))
++ lrng_pcpu_array_to_hash(ptr);
++}
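++
++/*
++ * Worked example of the split above (a sketch, not part of the original
++ * patch; it assumes 8-bit slots, i.e. 4 slots per u32): if the per-CPU
++ * counter was 2 before the addition, this_cpu_add_return() yields ptr == 6
++ * and lrng_pcpu_split_u32() computes pre_ptr == 2 and mask == 0xffff. The
++ * upper 16 bits of data are ORed into bits 16..31 of lrng_pcpu_array[0]
++ * (the previous unit) and the lower 16 bits are written to
++ * lrng_pcpu_array[1] (the current unit), so a time stamp can straddle two
++ * array words without losing bits.
++ */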
++
++/* Concatenate a 32-bit word at the end of the per-CPU array */
++void lrng_pcpu_array_add_u32(u32 data)
++{
++ /*
++ * Disregard entropy-less data without continuous compression to
++ * avoid it overwriting data with entropy when array ptr wraps.
++ */
++ if (lrng_pcpu_continuous_compression)
++ _lrng_pcpu_array_add_u32(data);
++}
++
++/* Concatenate data of max LRNG_DATA_SLOTSIZE_MASK at the end of time array */
++static inline void lrng_pcpu_array_add_slot(u32 data)
++{
++ /* Get slot */
++ u32 ptr = this_cpu_inc_return(lrng_pcpu_array_ptr) &
++ LRNG_DATA_WORD_MASK;
++ unsigned int array = lrng_data_idx2array(ptr);
++ unsigned int slot = lrng_data_idx2slot(ptr);
++
++ BUILD_BUG_ON(LRNG_DATA_ARRAY_MEMBER_BITS % LRNG_DATA_SLOTSIZE_BITS);
++ /* Ensure consistency of values */
++ BUILD_BUG_ON(LRNG_DATA_ARRAY_MEMBER_BITS !=
++ sizeof(lrng_pcpu_array[0]) << 3);
++
++ /* zeroization of slot to ensure the following OR adds the data */
++ this_cpu_and(lrng_pcpu_array[array],
++ ~(lrng_data_slot_val(0xffffffff & LRNG_DATA_SLOTSIZE_MASK,
++ slot)));
++ /* Store data into slot */
++ this_cpu_or(lrng_pcpu_array[array], lrng_data_slot_val(data, slot));
++
++ lrng_pcpu_array_to_hash(ptr);
++}
++
++static inline void
++lrng_time_process_common(u32 time, void(*add_time)(u32 data))
++{
++ enum lrng_health_res health_test;
++
++ if (lrng_raw_hires_entropy_store(time))
++ return;
++
++ health_test = lrng_health_test(time);
++ if (health_test > lrng_health_fail_use)
++ return;
++
++ if (health_test == lrng_health_pass)
++ atomic_inc_return(this_cpu_ptr(&lrng_pcpu_array_irqs));
++
++ add_time(time);
++}
++
++/*
++ * Batching up of entropy in per-CPU array before injecting into entropy pool.
++ */
++static inline void lrng_time_process(void)
++{
++ u32 now_time = random_get_entropy();
++
++ if (unlikely(!lrng_state_fully_seeded())) {
++ /* During boot time, we process the full time stamp */
++ lrng_time_process_common(now_time, _lrng_pcpu_array_add_u32);
++ } else {
++ /* Runtime operation */
++ lrng_time_process_common(now_time & LRNG_DATA_SLOTSIZE_MASK,
++ lrng_pcpu_array_add_slot);
++ }
++
++ lrng_perf_time(now_time);
++}
++
++/* Hot code path - Callback for interrupt handler */
++void add_interrupt_randomness(int irq, int irq_flg)
++{
++ if (lrng_pool_highres_timer()) {
++ lrng_time_process();
++ } else {
++ struct pt_regs *regs = get_irq_regs();
++ static atomic_t reg_idx = ATOMIC_INIT(0);
++ u64 ip;
++ u32 tmp;
++
++ if (regs) {
++ u32 *ptr = (u32 *)regs;
++			int reg_ptr = atomic_add_return_relaxed(1, &reg_idx);
++ size_t n = (sizeof(struct pt_regs) / sizeof(u32));
++
++ ip = instruction_pointer(regs);
++ tmp = *(ptr + (reg_ptr % n));
++ tmp = lrng_raw_regs_entropy_store(tmp) ? 0 : tmp;
++ _lrng_pcpu_array_add_u32(tmp);
++ } else {
++ ip = _RET_IP_;
++ }
++
++ lrng_time_process();
++
++ /*
++ * The XOR operation combining the different values is not
++ * considered to destroy entropy since the entirety of all
++ * processed values delivers the entropy (and not each
++ * value separately of the other values).
++ */
++ tmp = lrng_raw_jiffies_entropy_store(jiffies) ? 0 : jiffies;
++ tmp ^= lrng_raw_irq_entropy_store(irq) ? 0 : irq;
++ tmp ^= lrng_raw_irqflags_entropy_store(irq_flg) ? 0 : irq_flg;
++ tmp ^= lrng_raw_retip_entropy_store(ip) ? 0 : ip;
++ tmp ^= ip >> 32;
++ _lrng_pcpu_array_add_u32(tmp);
++ }
++}
++EXPORT_SYMBOL(add_interrupt_randomness);
+--- /dev/null
++++ b/drivers/char/lrng/lrng_sw_noise.h
+@@ -0,0 +1,71 @@
++/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
++/*
++ * LRNG Slow Noise Source: Time stamp array handling
++ *
++ * Copyright (C) 2016 - 2021, Stephan Mueller
++ */
++
++/*
++ * To limit the impact on the interrupt handling, the LRNG concatenates
++ * entropic LSB parts of the time stamps in a per-CPU array and only
++ * injects them into the entropy pool when the array is full.
++ */
++
++/* Store multiple integers in one u32 */
++#define LRNG_DATA_SLOTSIZE_BITS (8)
++#define LRNG_DATA_SLOTSIZE_MASK ((1 << LRNG_DATA_SLOTSIZE_BITS) - 1)
++#define LRNG_DATA_ARRAY_MEMBER_BITS (4 << 3) /* ((sizeof(u32)) << 3) */
++#define LRNG_DATA_SLOTS_PER_UINT (LRNG_DATA_ARRAY_MEMBER_BITS / \
++ LRNG_DATA_SLOTSIZE_BITS)
++
++/*
++ * Number of time values to store in the array - in small environments
++ * only one atomic_t variable per CPU is used.
++ */
++#define LRNG_DATA_NUM_VALUES (CONFIG_LRNG_COLLECTION_SIZE)
++/* Mask of LSB of time stamp to store */
++#define LRNG_DATA_WORD_MASK (LRNG_DATA_NUM_VALUES - 1)
++
++#define LRNG_DATA_SLOTS_MASK (LRNG_DATA_SLOTS_PER_UINT - 1)
++#define LRNG_DATA_ARRAY_SIZE (LRNG_DATA_NUM_VALUES / \
++ LRNG_DATA_SLOTS_PER_UINT)
++
++/* Starting bit index of slot */
++static inline unsigned int lrng_data_slot2bitindex(unsigned int slot)
++{
++ return (LRNG_DATA_SLOTSIZE_BITS * slot);
++}
++
++/* Convert index into the array index */
++static inline unsigned int lrng_data_idx2array(unsigned int idx)
++{
++ return idx / LRNG_DATA_SLOTS_PER_UINT;
++}
++
++/* Convert index into the slot of a given array index */
++static inline unsigned int lrng_data_idx2slot(unsigned int idx)
++{
++ return idx & LRNG_DATA_SLOTS_MASK;
++}
++
++/* Convert value into slot value */
++static inline unsigned int lrng_data_slot_val(unsigned int val,
++ unsigned int slot)
++{
++ return val << lrng_data_slot2bitindex(slot);
++}
++
++/*
++ * Return the pointers for the previous and current units to inject a u32 into.
++ * Also return the mask with which the u32 word is split across the two units.
++ */
++static inline void lrng_pcpu_split_u32(u32 *ptr, u32 *pre_ptr, u32 *mask)
++{
++ /* ptr to previous unit */
++ *pre_ptr = (*ptr - LRNG_DATA_SLOTS_PER_UINT) & LRNG_DATA_WORD_MASK;
++ *ptr &= LRNG_DATA_WORD_MASK;
++
++ /* mask to split data into the two parts for the two units */
++ *mask = ((1 << (*pre_ptr & (LRNG_DATA_SLOTS_PER_UINT - 1)) *
++ LRNG_DATA_SLOTSIZE_BITS)) - 1;
++}
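++
++/*
++ * Worked example (illustrative, not part of the original patch): with
++ * LRNG_DATA_SLOTSIZE_BITS == 8 a u32 array member holds four slots, so for
++ * e.g. LRNG_DATA_NUM_VALUES == 1024 the per-CPU array has 256 members.
++ * Time stamp index 10 then maps to
++ *
++ *   lrng_data_idx2array(10) == 2   (third u32 member)
++ *   lrng_data_idx2slot(10)  == 2   (third 8-bit slot within that member)
++ *
++ * and lrng_data_slot_val(0xab, 2) == 0x00ab0000, i.e. the 8 LSBs of the
++ * time stamp are shifted to bit position 16 before being ORed in.
++ */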
+--- /dev/null
++++ b/include/linux/lrng.h
+@@ -0,0 +1,81 @@
++/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
++/*
++ * Copyright (C) 2018 - 2021, Stephan Mueller
++ */
++
++#ifndef _LRNG_H
++#define _LRNG_H
++
++#include
++#include
++#include
++
++/**
++ * struct lrng_crypto_cb - cryptographic callback functions
++ * @lrng_drng_name Name of DRNG
++ * @lrng_hash_name Name of Hash used for reading entropy pool
++ * @lrng_drng_alloc: Allocate DRNG -- the provided integer should be
++ * used for sanity checks.
++ * return: allocated data structure or PTR_ERR on
++ * error
++ * @lrng_drng_dealloc: Deallocate DRNG
++ * @lrng_drng_seed_helper: Seed the DRNG with data of arbitrary length
++ * drng: is pointer to data structure allocated
++ * with lrng_drng_alloc
++ * return: >= 0 on success, < 0 on error
++ * @lrng_drng_generate_helper: Generate random numbers from the DRNG with
++ * arbitrary length
++ * @lrng_hash_alloc: Allocate the hash for reading the entropy pool
++ * return: allocated data structure (NULL is
++ * success too) or ERR_PTR on error
++ * @lrng_hash_dealloc: Deallocate Hash
++ * @lrng_hash_digestsize: Return the digestsize for the used hash to read
++ * out entropy pool
++ * hash: is pointer to data structure allocated
++ * with lrng_hash_alloc
++ * return: size of digest of hash in bytes
++ * @lrng_hash_init: Initialize hash
++ * hash: is pointer to data structure allocated
++ * with lrng_hash_alloc
++ * return: 0 on success, < 0 on error
++ * @lrng_hash_update: Update hash operation
++ * hash: is pointer to data structure allocated
++ * with lrng_hash_alloc
++ * return: 0 on success, < 0 on error
++ * @lrng_hash_final Final hash operation
++ * hash: is pointer to data structure allocated
++ * with lrng_hash_alloc
++ * return: 0 on success, < 0 on error
++ * @lrng_hash_desc_zero Zeroization of hash state buffer
++ *
++ * Assumptions:
++ *
++ * 1. Hash operation will not sleep
++ * 2. The hash's volatile state information is provided via *shash by the caller.
++ */
++struct lrng_crypto_cb {
++ const char *(*lrng_drng_name)(void);
++ const char *(*lrng_hash_name)(void);
++ void *(*lrng_drng_alloc)(u32 sec_strength);
++ void (*lrng_drng_dealloc)(void *drng);
++ int (*lrng_drng_seed_helper)(void *drng, const u8 *inbuf, u32 inbuflen);
++ int (*lrng_drng_generate_helper)(void *drng, u8 *outbuf, u32 outbuflen);
++ void *(*lrng_hash_alloc)(void);
++ void (*lrng_hash_dealloc)(void *hash);
++ u32 (*lrng_hash_digestsize)(void *hash);
++ int (*lrng_hash_init)(struct shash_desc *shash, void *hash);
++ int (*lrng_hash_update)(struct shash_desc *shash, const u8 *inbuf,
++ u32 inbuflen);
++ int (*lrng_hash_final)(struct shash_desc *shash, u8 *digest);
++ void (*lrng_hash_desc_zero)(struct shash_desc *shash);
++};
++
++/* Register cryptographic backend */
++#ifdef CONFIG_LRNG_DRNG_SWITCH
++int lrng_set_drng_cb(const struct lrng_crypto_cb *cb);
++#else /* CONFIG_LRNG_DRNG_SWITCH */
++static inline int
++lrng_set_drng_cb(const struct lrng_crypto_cb *cb) { return -EOPNOTSUPP; }
++#endif /* CONFIG_LRNG_DRNG_SWITCH */
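++
++/*
++ * Minimal registration sketch (illustrative only, not part of the patch;
++ * all my_* names are hypothetical). A DRNG extension fills in the callback
++ * structure and registers it, typically from a late_initcall; passing NULL
++ * to lrng_set_drng_cb() unregisters the extension again:
++ *
++ *	static const struct lrng_crypto_cb my_crypto_cb = {
++ *		.lrng_drng_name            = my_drng_name,
++ *		.lrng_hash_name            = my_hash_name,
++ *		.lrng_drng_alloc           = my_drng_alloc,
++ *		.lrng_drng_dealloc         = my_drng_dealloc,
++ *		.lrng_drng_seed_helper     = my_drng_seed,
++ *		.lrng_drng_generate_helper = my_drng_generate,
++ *		.lrng_hash_alloc           = my_hash_alloc,
++ *		.lrng_hash_dealloc         = my_hash_dealloc,
++ *		.lrng_hash_digestsize      = my_hash_digestsize,
++ *		.lrng_hash_init            = my_hash_init,
++ *		.lrng_hash_update          = my_hash_update,
++ *		.lrng_hash_final           = my_hash_final,
++ *		.lrng_hash_desc_zero       = my_hash_desc_zero,
++ *	};
++ *
++ *	static int __init my_drng_extension_init(void)
++ *	{
++ *		return lrng_set_drng_cb(&my_crypto_cb);
++ *	}
++ *	late_initcall(my_drng_extension_init);
++ */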
++
++#endif /* _LRNG_H */
diff --git a/PATCH/LRNG/696-05-v41-0002-LRNG-allocate-one-DRNG-instance-per-NUMA-node.patch b/PATCH/LRNG/696-05-v41-0002-LRNG-allocate-one-DRNG-instance-per-NUMA-node.patch
new file mode 100644
index 000000000..9cf20b6b5
--- /dev/null
+++ b/PATCH/LRNG/696-05-v41-0002-LRNG-allocate-one-DRNG-instance-per-NUMA-node.patch
@@ -0,0 +1,201 @@
+From 0a73760d76194fd8fa2aa039d121b28c9d9e200a Mon Sep 17 00:00:00 2001
+From: Stephan Mueller
+Date: Fri, 18 Jun 2021 08:03:15 +0200
+Subject: [PATCH v41 02/13] LRNG - allocate one DRNG instance per NUMA node
+
+In order to improve NUMA-locality when serving getrandom(2) requests,
+allocate one DRNG instance per node.
+
+The DRNG instance that is present right from the start of the kernel is
+reused as the first per-NUMA-node DRNG. For all remaining online NUMA
+nodes a new DRNG instance is allocated.
+
+During boot time, the multiple DRNG instances are seeded sequentially.
+With this, the first DRNG instance (referenced as the initial DRNG
+in the code) is completely seeded with 256 bits of entropy before the
+next DRNG instance is completely seeded.
+
+When random numbers are requested, the NUMA-node-local DRNG is checked
+whether it has been already fully seeded. If this is not the case, the
+initial DRNG is used to serve the request.
+
+CC: Torsten Duwe
+CC: "Eric W. Biederman"
+CC: "Alexander E. Patrakov"
+CC: "Ahmed S. Darwish"
+CC: "Theodore Y. Ts'o"
+CC: Willy Tarreau
+CC: Matthew Garrett
+CC: Vito Caputo
+CC: Andreas Dilger
+CC: Jan Kara
+CC: Ray Strode
+CC: William Jon McCann
+CC: zhangjs
+CC: Andy Lutomirski
+CC: Florian Weimer
+CC: Lennart Poettering
+CC: Nicolai Stange
+CC: Eric Biggers
+CC: Alexander Lobakin
+Reviewed-by: Marcelo Henrique Cerri
+Reviewed-by: Roman Drahtmueller
+Tested-by: Marcelo Henrique Cerri
+Tested-by: Neil Horman
+Signed-off-by: Stephan Mueller
+---
+ drivers/char/lrng/Makefile | 2 +
+ drivers/char/lrng/lrng_internal.h | 5 ++
+ drivers/char/lrng/lrng_numa.c | 122 ++++++++++++++++++++++++++++++
+ 3 files changed, 129 insertions(+)
+ create mode 100644 drivers/char/lrng/lrng_numa.c
+
+--- a/drivers/char/lrng/Makefile
++++ b/drivers/char/lrng/Makefile
+@@ -7,3 +7,5 @@ obj-y += lrng_pool.o lrng_aux.o \
+ lrng_sw_noise.o lrng_archrandom.o \
+ lrng_drng.o lrng_chacha20.o \
+ lrng_interfaces.o
++
++obj-$(CONFIG_NUMA) += lrng_numa.o
+--- a/drivers/char/lrng/lrng_internal.h
++++ b/drivers/char/lrng/lrng_internal.h
+@@ -254,8 +254,13 @@ int lrng_drng_get_sleep(u8 *outbuf, u32
+ void lrng_drng_force_reseed(void);
+ void lrng_drng_seed_work(struct work_struct *dummy);
+
++#ifdef CONFIG_NUMA
++struct lrng_drng **lrng_drng_instances(void);
++void lrng_drngs_numa_alloc(void);
++#else /* CONFIG_NUMA */
+ static inline struct lrng_drng **lrng_drng_instances(void) { return NULL; }
+ static inline void lrng_drngs_numa_alloc(void) { return; }
++#endif /* CONFIG_NUMA */
+
+ /************************* Entropy sources management *************************/
+
+--- /dev/null
++++ b/drivers/char/lrng/lrng_numa.c
+@@ -0,0 +1,122 @@
++// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
++/*
++ * LRNG NUMA support
++ *
++ * Copyright (C) 2016 - 2021, Stephan Mueller
++ */
++
++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
++
++#include
++#include
++
++#include "lrng_internal.h"
++
++static struct lrng_drng **lrng_drng __read_mostly = NULL;
++
++struct lrng_drng **lrng_drng_instances(void)
++{
++ return smp_load_acquire(&lrng_drng);
++}
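++
++/*
++ * Illustrative note (not part of the original patch): readers obtain the
++ * per-NUMA-node DRNG array only through the smp_load_acquire() above, while
++ * _lrng_drngs_numa_alloc() publishes the fully initialized array with
++ * cmpxchg_release(). A reader therefore either sees NULL (and falls back to
++ * the initial DRNG) or an array whose entries are completely set up.
++ */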
++
++/* Allocate the data structures for the per-NUMA node DRNGs */
++static void _lrng_drngs_numa_alloc(struct work_struct *work)
++{
++ struct lrng_drng **drngs;
++ struct lrng_drng *lrng_drng_init = lrng_drng_init_instance();
++ u32 node;
++ bool init_drng_used = false;
++
++ mutex_lock(&lrng_crypto_cb_update);
++
++ /* per-NUMA-node DRNGs are already present */
++ if (lrng_drng)
++ goto unlock;
++
++ drngs = kcalloc(nr_node_ids, sizeof(void *), GFP_KERNEL|__GFP_NOFAIL);
++ for_each_online_node(node) {
++ struct lrng_drng *drng;
++
++ if (!init_drng_used) {
++ drngs[node] = lrng_drng_init;
++ init_drng_used = true;
++ continue;
++ }
++
++ drng = kmalloc_node(sizeof(struct lrng_drng),
++ GFP_KERNEL|__GFP_NOFAIL, node);
++		memset(drng, 0, sizeof(struct lrng_drng));
++
++ drng->crypto_cb = lrng_drng_init->crypto_cb;
++ drng->drng = drng->crypto_cb->lrng_drng_alloc(
++ LRNG_DRNG_SECURITY_STRENGTH_BYTES);
++ if (IS_ERR(drng->drng)) {
++ kfree(drng);
++ goto err;
++ }
++
++ drng->hash = drng->crypto_cb->lrng_hash_alloc();
++ if (IS_ERR(drng->hash)) {
++ drng->crypto_cb->lrng_drng_dealloc(drng->drng);
++ kfree(drng);
++ goto err;
++ }
++
++ mutex_init(&drng->lock);
++ spin_lock_init(&drng->spin_lock);
++ rwlock_init(&drng->hash_lock);
++
++ /*
++ * Switch the hash used by the per-CPU pool.
++ * We do not need to lock the new hash as it is not usable yet
++ * due to **drngs not yet being initialized.
++ */
++ if (lrng_pcpu_switch_hash(node, drng->crypto_cb, drng->hash,
++ &lrng_cc20_crypto_cb))
++ goto err;
++
++ /*
++ * No reseeding of NUMA DRNGs from previous DRNGs as this
++ * would complicate the code. Let it simply reseed.
++ */
++ lrng_drng_reset(drng);
++ drngs[node] = drng;
++
++ lrng_pool_inc_numa_node();
++ pr_info("DRNG and entropy pool read hash for NUMA node %d allocated\n",
++ node);
++ }
++
++ /* counterpart to smp_load_acquire in lrng_drng_instances */
++ if (!cmpxchg_release(&lrng_drng, NULL, drngs)) {
++ lrng_pool_all_numa_nodes_seeded(false);
++ goto unlock;
++ }
++
++err:
++ for_each_online_node(node) {
++ struct lrng_drng *drng = drngs[node];
++
++ if (drng == lrng_drng_init)
++ continue;
++
++ if (drng) {
++ lrng_pcpu_switch_hash(node, &lrng_cc20_crypto_cb, NULL,
++ drng->crypto_cb);
++ drng->crypto_cb->lrng_hash_dealloc(drng->hash);
++ drng->crypto_cb->lrng_drng_dealloc(drng->drng);
++ kfree(drng);
++ }
++ }
++ kfree(drngs);
++
++unlock:
++ mutex_unlock(&lrng_crypto_cb_update);
++}
++
++static DECLARE_WORK(lrng_drngs_numa_alloc_work, _lrng_drngs_numa_alloc);
++
++void lrng_drngs_numa_alloc(void)
++{
++ schedule_work(&lrng_drngs_numa_alloc_work);
++}
diff --git a/PATCH/LRNG/696-06-v41-0003-LRNG-sysctls-and-proc-interface.patch b/PATCH/LRNG/696-06-v41-0003-LRNG-sysctls-and-proc-interface.patch
new file mode 100644
index 000000000..f517737f6
--- /dev/null
+++ b/PATCH/LRNG/696-06-v41-0003-LRNG-sysctls-and-proc-interface.patch
@@ -0,0 +1,287 @@
+From 28ba413daaf3dc24f8efb15335a8da82201e42d7 Mon Sep 17 00:00:00 2001
+From: Stephan Mueller
+Date: Wed, 23 Jun 2021 18:44:26 +0200
+Subject: [PATCH v41 03/13] LRNG - sysctls and /proc interface
+
+The LRNG sysctl interface provides the same controls as the existing
+/dev/random implementation. These sysctls behave identically and are
+implemented identically. The goal is to allow a possible merge of the
+existing /dev/random implementation with this implementation which
+implies that this patch tries have a very close similarity. Yet, all
+sysctls are documented at [1].
+
+In addition, it provides the file lrng_type which provides details about
+the LRNG:
+
+- the name of the DRNG that produces the random numbers for /dev/random,
+/dev/urandom, getrandom(2)
+
+- the hash used to produce random numbers from the entropy pool
+
+- the number of secondary DRNG instances
+
+- indicator whether the LRNG operates SP800-90B compliant
+
+- indicator whether a high-resolution timer is identified - only with a
+high-resolution timer will the interrupt noise source deliver sufficient
+entropy
+
+- indicator whether the LRNG has been minimally seeded (i.e. is the
+secondary DRNG seeded with at least 128 bits of entropy)
+
+- indicator whether the LRNG has been fully seeded (i.e. is the
+secondary DRNG seeded with at least 256 bits of entropy)
+
+[1] https://www.chronox.de/lrng.html
+
+CC: Torsten Duwe
+CC: "Eric W. Biederman"
+CC: "Alexander E. Patrakov"
+CC: "Ahmed S. Darwish"
+CC: "Theodore Y. Ts'o"
+CC: Willy Tarreau
+CC: Matthew Garrett
+CC: Vito Caputo
+CC: Andreas Dilger
+CC: Jan Kara
+CC: Ray Strode
+CC: William Jon McCann
+CC: zhangjs
+CC: Andy Lutomirski
+CC: Florian Weimer
+CC: Lennart Poettering
+CC: Nicolai Stange
+CC: Alexander Lobakin
+Reviewed-by: Marcelo Henrique Cerri
+Reviewed-by: Roman Drahtmueller
+Tested-by: Marcelo Henrique Cerri
+Tested-by: Neil Horman
+Signed-off-by: Stephan Mueller
+---
+ drivers/char/lrng/Makefile | 1 +
+ drivers/char/lrng/lrng_interfaces.c | 2 -
+ drivers/char/lrng/lrng_internal.h | 4 +
+ drivers/char/lrng/lrng_proc.c | 185 ++++++++++++++++++++++++++++
+ 4 files changed, 190 insertions(+), 2 deletions(-)
+ create mode 100644 drivers/char/lrng/lrng_proc.c
+
+--- a/drivers/char/lrng/Makefile
++++ b/drivers/char/lrng/Makefile
+@@ -9,3 +9,4 @@ obj-y += lrng_pool.o lrng_aux.o \
+ lrng_interfaces.o
+
+ obj-$(CONFIG_NUMA) += lrng_numa.o
++obj-$(CONFIG_SYSCTL) += lrng_proc.o
+--- a/drivers/char/lrng/lrng_interfaces.c
++++ b/drivers/char/lrng/lrng_interfaces.c
+@@ -38,8 +38,6 @@ static DECLARE_WAIT_QUEUE_HEAD(lrng_writ
+ static DECLARE_WAIT_QUEUE_HEAD(lrng_init_wait);
+ static struct fasync_struct *fasync;
+
+-struct ctl_table random_table[];
+-
+ /********************************** Helper ***********************************/
+
+ /* Is the DRNG seed level too low? */
+--- a/drivers/char/lrng/lrng_internal.h
++++ b/drivers/char/lrng/lrng_internal.h
+@@ -114,7 +114,11 @@ void lrng_cc20_init_state(struct chacha2
+
+ /********************************** /proc *************************************/
+
++#ifdef CONFIG_SYSCTL
++void lrng_pool_inc_numa_node(void);
++#else
+ static inline void lrng_pool_inc_numa_node(void) { }
++#endif
+
+ /****************************** LRNG interfaces *******************************/
+
+--- /dev/null
++++ b/drivers/char/lrng/lrng_proc.c
+@@ -0,0 +1,185 @@
++// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
++/*
++ * LRNG proc and sysctl interfaces
++ *
++ * Copyright (C) 2016 - 2021, Stephan Mueller
++ */
++
++#include
++#include
++#include
++#include
++#include
++
++#include "lrng_internal.h"
++#include "lrng_sw_noise.h"
++
++/*
++ * This function is used to return both the bootid UUID, and random
++ * UUID. The difference is in whether table->data is NULL; if it is,
++ * then a new UUID is generated and returned to the user.
++ *
++ * If the user accesses this via the proc interface, the UUID will be
++ * returned as an ASCII string in the standard UUID format; if via the
++ * sysctl system call, as 16 bytes of binary data.
++ */
++static int lrng_proc_do_uuid(struct ctl_table *table, int write,
++ void *buffer, size_t *lenp, loff_t *ppos)
++{
++ struct ctl_table fake_table;
++ unsigned char buf[64], tmp_uuid[16], *uuid;
++
++ uuid = table->data;
++ if (!uuid) {
++ uuid = tmp_uuid;
++ generate_random_uuid(uuid);
++ } else {
++ static DEFINE_SPINLOCK(bootid_spinlock);
++
++ spin_lock(&bootid_spinlock);
++ if (!uuid[8])
++ generate_random_uuid(uuid);
++ spin_unlock(&bootid_spinlock);
++ }
++
++ sprintf(buf, "%pU", uuid);
++
++ fake_table.data = buf;
++ fake_table.maxlen = sizeof(buf);
++
++ return proc_dostring(&fake_table, write, buffer, lenp, ppos);
++}
++
++static int lrng_proc_do_entropy(struct ctl_table *table, int write,
++ void *buffer, size_t *lenp, loff_t *ppos)
++{
++ struct ctl_table fake_table;
++ int entropy_count;
++
++ entropy_count = lrng_avail_entropy();
++
++ fake_table.data = &entropy_count;
++ fake_table.maxlen = sizeof(entropy_count);
++
++ return proc_dointvec(&fake_table, write, buffer, lenp, ppos);
++}
++
++static int lrng_proc_do_poolsize(struct ctl_table *table, int write,
++ void *buffer, size_t *lenp, loff_t *ppos)
++{
++ struct ctl_table fake_table;
++ int entropy_count;
++
++ /* LRNG can at most retain entropy in per-CPU pools and aux pool */
++ entropy_count = lrng_get_digestsize() + lrng_pcpu_avail_pool_size();
++
++ fake_table.data = &entropy_count;
++ fake_table.maxlen = sizeof(entropy_count);
++
++ return proc_dointvec(&fake_table, write, buffer, lenp, ppos);
++}
++
++static int lrng_min_write_thresh;
++static int lrng_max_write_thresh = LRNG_MAX_DIGESTSIZE;
++static char lrng_sysctl_bootid[16];
++static int lrng_drng_reseed_max_min;
++
++struct ctl_table random_table[] = {
++ {
++ .procname = "poolsize",
++ .maxlen = sizeof(int),
++ .mode = 0444,
++ .proc_handler = lrng_proc_do_poolsize,
++ },
++ {
++ .procname = "entropy_avail",
++ .maxlen = sizeof(int),
++ .mode = 0444,
++ .proc_handler = lrng_proc_do_entropy,
++ },
++ {
++ .procname = "write_wakeup_threshold",
++ .data = &lrng_write_wakeup_bits,
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = proc_dointvec_minmax,
++ .extra1 = &lrng_min_write_thresh,
++ .extra2 = &lrng_max_write_thresh,
++ },
++ {
++ .procname = "boot_id",
++ .data = &lrng_sysctl_bootid,
++ .maxlen = 16,
++ .mode = 0444,
++ .proc_handler = lrng_proc_do_uuid,
++ },
++ {
++ .procname = "uuid",
++ .maxlen = 16,
++ .mode = 0444,
++ .proc_handler = lrng_proc_do_uuid,
++ },
++ {
++ .procname = "urandom_min_reseed_secs",
++ .data = &lrng_drng_reseed_max_time,
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = proc_dointvec,
++ .extra1 = &lrng_drng_reseed_max_min,
++ },
++ { }
++};
++
++/* Number of online DRNGs */
++static u32 numa_drngs = 1;
++
++void lrng_pool_inc_numa_node(void)
++{
++ numa_drngs++;
++}
++
++static int lrng_proc_type_show(struct seq_file *m, void *v)
++{
++ struct lrng_drng *lrng_drng_init = lrng_drng_init_instance();
++ unsigned long flags = 0;
++ unsigned char buf[390];
++
++ lrng_drng_lock(lrng_drng_init, &flags);
++ snprintf(buf, sizeof(buf),
++ "DRNG name: %s\n"
++ "Hash for reading entropy pool: %s\n"
++ "Hash for operating aux entropy pool: %s\n"
++ "LRNG security strength in bits: %d\n"
++ "per-CPU interrupt collection size: %u\n"
++ "number of DRNG instances: %u\n"
++ "Standards compliance: %s%s\n"
++ "High-resolution timer: %s\n"
++ "LRNG minimally seeded: %s\n"
++ "LRNG fully seeded: %s\n"
++ "Continuous compression: %s\n",
++ lrng_drng_init->crypto_cb->lrng_drng_name(),
++ lrng_drng_init->crypto_cb->lrng_hash_name(),
++ lrng_drng_init->crypto_cb->lrng_hash_name(),
++ lrng_security_strength(),
++ LRNG_DATA_NUM_VALUES,
++ numa_drngs,
++ lrng_sp80090b_compliant() ? "SP800-90B " : "",
++ lrng_sp80090c_compliant() ? "SP800-90C " : "",
++ lrng_pool_highres_timer() ? "true" : "false",
++ lrng_state_min_seeded() ? "true" : "false",
++ lrng_state_fully_seeded() ? "true" : "false",
++ lrng_pcpu_continuous_compression_state() ? "true" : "false");
++ lrng_drng_unlock(lrng_drng_init, &flags);
++
++ seq_write(m, buf, strlen(buf));
++
++ return 0;
++}
++
++static int __init lrng_proc_type_init(void)
++{
++ proc_create_single("lrng_type", 0444, NULL, &lrng_proc_type_show);
++ return 0;
++}
++
++module_init(lrng_proc_type_init);
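++
++/*
++ * Example /proc/lrng_type output (illustrative only; the concrete values
++ * depend on the kernel configuration, the values below are hypothetical):
++ *
++ *   DRNG name: ChaCha20 DRNG
++ *   Hash for reading entropy pool: SHA-256
++ *   Hash for operating aux entropy pool: SHA-256
++ *   LRNG security strength in bits: 256
++ *   per-CPU interrupt collection size: 1024
++ *   number of DRNG instances: 1
++ *   Standards compliance:
++ *   High-resolution timer: true
++ *   LRNG minimally seeded: true
++ *   LRNG fully seeded: true
++ *   Continuous compression: true
++ */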
diff --git a/PATCH/LRNG/696-07-v41-0004-LRNG-add-switchable-DRNG-support.patch b/PATCH/LRNG/696-07-v41-0004-LRNG-add-switchable-DRNG-support.patch
new file mode 100644
index 000000000..975c74cf0
--- /dev/null
+++ b/PATCH/LRNG/696-07-v41-0004-LRNG-add-switchable-DRNG-support.patch
@@ -0,0 +1,326 @@
+From 2ea9e51004f8a85964977ec7cd26f9f5a294295a Mon Sep 17 00:00:00 2001
+From: Stephan Mueller
+Date: Fri, 18 Jun 2021 08:06:39 +0200
+Subject: [PATCH v41 04/13] LRNG - add switchable DRNG support
+
+The DRNG switch support allows replacing the DRNG mechanism of the
+LRNG. The switching support rests on the interface definition of
+include/linux/lrng.h. A new DRNG is implemented by filling in the
+interface defined in this header file.
+
+In addition to the DRNG, the extension also has to provide a hash
+implementation that is used to hash the entropy pool for random number
+extraction.
+
+Note: It is permissible to implement a DRNG whose operations may sleep.
+However, the hash function must not sleep.
+
+The switchable DRNG support allows replacing the DRNG at runtime.
+However, only one DRNG extension is allowed to be loaded at any given
+time. Before replacing it with another DRNG implementation, the possibly
+existing DRNG extension must be unloaded.
+
+The switchable DRNG extension activates the new DRNG during load time.
+It is expected, however, that such a DRNG switch would be done only once
+by an administrator to load the intended DRNG implementation.
+
+It is permissible to compile DRNG extensions either as kernel modules or
+statically. The initialization of the DRNG extension should be performed
+with a late_initcall to ensure the extension is available when user
+space starts but after all other initialization completed.
+The initialization is performed by registering the function call data
+structure with the lrng_set_drng_cb function. In order to unload the
+DRNG extension, lrng_set_drng_cb must be invoked with the NULL
+parameter.
+
+The DRNG extension should always provide a security strength that is at
+least as strong as LRNG_DRNG_SECURITY_STRENGTH_BITS.
+
+The hash extension must not sleep and must not maintain a separate
+state.
+
+CC: Torsten Duwe
+CC: "Eric W. Biederman"
+CC: "Alexander E. Patrakov"
+CC: "Ahmed S. Darwish"
+CC: "Theodore Y. Ts'o"
+CC: Willy Tarreau
+CC: Matthew Garrett
+CC: Vito Caputo
+CC: Andreas Dilger
+CC: Jan Kara
+CC: Ray Strode
+CC: William Jon McCann
+CC: zhangjs
+CC: Andy Lutomirski
+CC: Florian Weimer
+CC: Lennart Poettering