use std::any::Any;
use std::cmp;
use std::collections::{btree_map, btree_set, BTreeMap, BTreeSet, HashMap};
use std::fmt::{Debug, Display};
use std::iter::{Copied, Map};
use std::sync::{Arc, Mutex};
use crate::proximity;
/// Opaque, copyable identifier for a document in a [`DocumentMap`].
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct Id(u64);
impl Id {
    /// Wraps a raw id. Private so only this module can mint ids.
    fn new(id: u64) -> Self {
        Id(id)
    }
    /// Returns the raw numeric value of this id.
    #[must_use]
    pub fn inner(self) -> u64 {
        let Id(raw) = self;
        raw
    }
}
/// A single occurrence of a searched word: byte position, document, query word
/// id, and an accumulated relevance rating.
#[derive(Debug, PartialEq, PartialOrd, Clone)]
pub struct Occurence {
    start: usize,
    pub(crate) document_id: Id,
    pub(crate) word_id: u32,
    rating: f32,
}
impl Occurence {
    /// Creates an occurrence at byte offset `pos` with a neutral (`0.0`) rating.
    #[inline]
    fn new(pos: usize, document_id: Id, word_id: u32) -> Self {
        Self {
            document_id,
            word_id,
            start: pos,
            rating: 0.0,
        }
    }
    /// Byte offset of the occurrence within the document.
    #[must_use]
    #[inline]
    pub fn start(&self) -> usize {
        self.start
    }
    /// The document this occurrence was found in.
    #[must_use]
    #[inline]
    pub fn id(&self) -> Id {
        self.document_id
    }
    /// Id of the query word that matched.
    #[must_use]
    #[inline]
    pub fn word_id(&self) -> u32 {
        self.word_id
    }
    /// Accumulated relevance rating.
    #[must_use]
    #[inline]
    pub fn rating(&self) -> f32 {
        self.rating
    }
    /// Crate-internal mutable access for rating adjustments.
    #[inline]
    pub(crate) fn rating_mut(&mut self) -> &mut f32 {
        &mut self.rating
    }
    /// Crate-internal mutable access to the byte offset.
    #[inline]
    pub(crate) fn start_mut(&mut self) -> &mut usize {
        &mut self.start
    }
}
/// A lightweight (position, word id) pair extracted from a hit or occurrence,
/// without the document id or rating.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct AssociatedOccurrence {
    start: usize,
    word_id: u32,
}
impl AssociatedOccurrence {
    /// Creates an associated occurrence at byte offset `start` for `word_id`.
    pub(crate) fn new(start: usize, word_id: u32) -> Self {
        Self { start, word_id }
    }
    /// Byte offset of the occurrence.
    #[must_use]
    #[inline]
    pub fn start(&self) -> usize {
        self.start
    }
    /// Id of the query word this occurrence belongs to.
    #[must_use]
    #[inline]
    pub fn word_id(&self) -> u32 {
        self.word_id
    }
}
impl From<&crate::Hit> for AssociatedOccurrence {
    #[inline]
    fn from(hit: &crate::Hit) -> Self {
        Self::new(hit.start(), hit.word_id())
    }
}
impl From<&Occurence> for AssociatedOccurrence {
    #[inline]
    fn from(occ: &Occurence) -> Self {
        Self::new(occ.start(), occ.word_id())
    }
}
/// A string viewed as only its alphanumeric characters, lowercased.
///
/// Equality, ordering, hashing, and display all operate on the filtered,
/// lowercased character sequence, not on the underlying string.
#[derive(Clone)]
pub struct Alphanumeral<T: ?Sized> {
    include_other: bool,
    s: T,
}
impl<T> Alphanumeral<T> {
    /// Wraps `s`; `chars` will only yield its alphanumeric characters.
    #[allow(clippy::inline_always)]
    #[inline(always)]
    pub fn new(s: T) -> Self {
        Self {
            include_other: false,
            s,
        }
    }
    /// Like [`Self::new`], but `chars` also yields non-alphanumeric characters.
    fn whole(s: T) -> Self {
        Self {
            include_other: true,
            s,
        }
    }
    /// Clones the underlying string into an owned `Alphanumeral`.
    #[allow(clippy::inline_always)]
    #[inline(always)]
    pub fn owned(&self) -> Alphanumeral<T::Owned>
    where
        T: ToOwned + AsRef<str>,
    {
        Alphanumeral::new(self.s.to_owned())
    }
}
impl<T: AsRef<str>> From<T> for Alphanumeral<T> {
    fn from(v: T) -> Self {
        Self::new(v)
    }
}
impl<T: AsRef<str>> Alphanumeral<T> {
    /// Iterator over the characters considered for comparison: alphanumeric
    /// only (all characters if built via `whole`), lowercased.
    #[allow(clippy::inline_always)]
    #[inline(always)]
    pub fn chars(&self) -> impl Iterator<Item = char> + Clone + '_ {
        self.s
            .as_ref()
            .chars()
            .filter(|c: &char| c.is_alphanumeric() || self.include_other)
            .flat_map(char::to_lowercase)
    }
}
impl<T: AsRef<str>> AsRef<str> for Alphanumeral<T> {
    /// The raw, unfiltered underlying string.
    fn as_ref(&self) -> &str {
        self.s.as_ref()
    }
}
impl<T1: AsRef<str>, T2: AsRef<str>> PartialEq<Alphanumeral<T1>> for Alphanumeral<T2> {
    fn eq(&self, other: &Alphanumeral<T1>) -> bool {
        self.chars().eq(other.chars())
    }
}
impl<T: AsRef<str> + Any> Eq for Alphanumeral<T> {}
/// Hash over the same character sequence `Eq` compares, upholding the
/// `Hash`/`Eq` contract. The previous `#[derive(Hash)]` hashed the raw
/// underlying string, so two values comparing equal (e.g. `"a-b"` and `"AB"`)
/// could hash differently — undefined behaviour for hash-based collections.
impl<T: AsRef<str>> std::hash::Hash for Alphanumeral<T> {
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        for c in self.chars() {
            state.write_u32(c as u32);
        }
    }
}
impl<T: AsRef<str> + Any> PartialOrd for Alphanumeral<T> {
    fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> {
        Some(self.cmp(other))
    }
}
impl<T: AsRef<str> + Any> Ord for Alphanumeral<T> {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.chars().cmp(other.chars())
    }
}
impl<T: AsRef<str>> Debug for Alphanumeral<T> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "\"{}\"", self.chars().collect::<String>())
    }
}
impl<T: AsRef<str>> Display for Alphanumeral<T> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.chars().collect::<String>())
    }
}
/// A raw pointer to a string slice, optionally owning its allocation.
///
/// Lets map keys of type `Alphanumeral<StrPtr>` be looked up with a borrowed,
/// allocation-free pointer (`sref`) while stored keys own their data (`owned`).
pub struct StrPtr {
s: *const str,
owned: bool,
}
impl StrPtr {
/// # Safety
/// If `drop` is `true`, `s` must point to a leaked heap allocation this value
/// may free on `Drop`; if `false`, the caller must keep `s` alive for as long
/// as the returned value is used.
#[allow(clippy::inline_always)]
#[inline(always)]
unsafe fn new(s: &str, drop: bool) -> Self {
Self { s, owned: drop }
}
/// Borrowed, non-owning pointer, freed by the caller.
/// # Safety
/// The caller must not use the returned value after `s` is dropped; `StrPtr`
/// carries no lifetime, so the compiler cannot enforce this (the
/// `where Self: 'a` bound is advisory only).
#[allow(clippy::inline_always)]
#[inline(always)]
unsafe fn sref<'a>(s: &'a str) -> Self
where
Self: 'a,
{
Self::new(s, false)
}
/// Owning pointer: the box is leaked here and reclaimed in `Drop`.
#[allow(clippy::inline_always)]
#[inline(always)]
fn owned(s: impl Into<Box<str>>) -> Self {
// ManuallyDrop keeps the allocation alive past this function; `&s`
// deref-coerces through `ManuallyDrop<Box<str>>` to `&str`.
let s = std::mem::ManuallyDrop::new(s.into());
unsafe { Self::new(&s, true) }
}
/// The stored pointer, cast to mutable for deallocation purposes.
#[allow(clippy::inline_always)]
#[inline(always)]
fn as_mut(&self) -> *mut str {
self.s as *mut _
}
}
impl Drop for StrPtr {
    fn drop(&mut self) {
        // Only owning pointers (created via `Self::owned` / `Clone`) free
        // their allocation; borrowed `sref` pointers are freed by their owner.
        if self.owned {
            // Reconstruct the `Box<str>` leaked by `Self::owned` so the heap
            // allocation is actually deallocated. The previous implementation
            // called `drop_in_place`, which only runs `str`'s (no-op)
            // destructor and never frees the memory — leaking every owned
            // string stored in an index.
            // SAFETY: `owned == true` only for pointers obtained from a
            // `Box<str>` in `Self::owned`, and `Drop` runs at most once.
            drop(unsafe { Box::from_raw(self.as_mut()) });
        }
    }
}
impl AsRef<str> for StrPtr {
#[allow(clippy::inline_always)]
#[inline(always)]
fn as_ref(&self) -> &str {
// SAFETY: per the constructors' contracts, `s` is valid for the life of `self`.
unsafe { &*self.s }
}
}
impl Debug for StrPtr {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
std::fmt::Debug::fmt(&self.as_ref(), f)
}
}
impl Display for StrPtr {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
std::fmt::Display::fmt(&self.as_ref(), f)
}
}
impl Clone for StrPtr {
// Cloning always produces an owning copy, regardless of `self.owned`.
fn clone(&self) -> Self {
Self::owned(self.as_ref())
}
}
// NOTE(review): these impls assume the pointed-to `str` is never mutated and
// that borrowed (`sref`) pointers never outlive their source across threads —
// confirm against the crate's usage before relying on them.
unsafe impl Send for StrPtr {}
unsafe impl Sync for StrPtr {}
/// An `Alphanumeral` keyed by a (possibly owned) raw string pointer.
pub(crate) type AlphanumRef = Alphanumeral<StrPtr>;
/// Bidirectional map between document names and stable numeric [`Id`]s.
#[derive(Debug)]
#[must_use]
pub struct DocumentMap {
    name_to_id: HashMap<String, Id>,
    id_to_name: BTreeMap<Id, String>,
}
impl DocumentMap {
    /// Creates an empty map.
    pub fn new() -> Self {
        Self {
            name_to_id: HashMap::new(),
            id_to_name: BTreeMap::new(),
        }
    }
    /// Returns the id of `name`, allocating the lowest free id if `name` is new.
    #[allow(clippy::missing_panics_doc)]
    pub fn reserve_id(&mut self, name: impl Into<String> + AsRef<str>) -> Id {
        if let Some(id) = self.get_id(name.as_ref()) {
            return id;
        }
        let name = name.into();
        let id = self.get_first();
        self.name_to_id.insert(name.clone(), id);
        let old_doc = self.id_to_name.insert(id, name);
        // `get_first` returned a free id, so nothing can be stored here yet.
        assert_eq!(old_doc, None);
        id
    }
    /// Registers `name` (allocating an id if needed) and indexes `content` in
    /// `provider`.
    pub fn insert<'a>(
        &mut self,
        name: impl Into<String> + AsRef<str>,
        content: &str,
        provider: &mut impl Provider<'a>,
    ) {
        let id = self.reserve_id(name);
        provider.digest_document(id, content);
    }
    /// Returns the lowest id not currently in use.
    fn get_first(&self) -> Id {
        // Walk the sorted ids and stop at the first gap. Unlike the previous
        // implementation (which started with `last = 0` and treated a leading
        // id of 1 as contiguous), this also reclaims a freed id 0.
        let mut expected = 0;
        for id in self.id_to_name.keys() {
            if id.inner() != expected {
                return Id::new(expected);
            }
            expected = id.inner() + 1;
        }
        Id::new(expected)
    }
    /// Returns `true` if a document called `name` is registered.
    #[must_use]
    pub fn contains(&self, name: &str) -> bool {
        self.get_id(name).is_some()
    }
    /// Looks up the id of `name`.
    #[must_use]
    pub fn get_id(&self, name: &str) -> Option<Id> {
        self.name_to_id.get(name).copied()
    }
    /// Looks up the name of `id`.
    #[must_use]
    pub fn get_name(&self, id: Id) -> Option<&str> {
        self.id_to_name.get(&id).map(|s| &**s)
    }
    /// Removes `document` from this map and from `provider`.
    /// Does nothing if the id is unknown.
    #[allow(clippy::missing_panics_doc)]
    pub fn force_remove<'a>(&mut self, document: Id, provider: &mut impl Provider<'a>) {
        let previous = if let Some(prev) = self.id_to_name.remove(&document) {
            prev
        } else {
            return;
        };
        let previous = self.name_to_id.remove(&previous);
        // Both maps are kept in sync, so the name must map back to `document`.
        assert_eq!(previous, Some(document));
        provider.remove_document(document);
    }
}
impl Default for DocumentMap {
    fn default() -> Self {
        Self::new()
    }
}
/// A recorded byte position of a word within a document.
#[derive(Debug, Clone)]
pub struct WordOccurrence {
    pos: usize,
}
impl WordOccurrence {
    /// Byte position of the word in the document's text.
    #[must_use]
    pub fn position(&self) -> usize {
        let Self { pos } = self;
        *pos
    }
}
/// Returns the next Unicode scalar value after `c`, skipping the surrogate
/// range. Returns `c` itself when `c` is `char::MAX`.
#[must_use]
pub fn next_char(c: char) -> char {
    let end = char::MAX as u32;
    let mut candidate = c as u32;
    loop {
        candidate += 1;
        // The UTF-16 surrogate range holds no scalar values; jump past it.
        if (0xD800..=0xDFFF).contains(&candidate) {
            candidate = 0xE000;
        }
        match char::from_u32(candidate) {
            Some(next) => return next,
            // Ran off the end of the scalar-value space: saturate at `c`.
            None if candidate >= end => return c,
            None => {}
        }
    }
}
/// An index over words: which documents they occur in and (depending on the
/// implementation) at which positions.
pub trait Provider<'a> {
/// Iterator over the documents containing a given word.
type DocumentIter: Iterator<Item = Id> + ExactSizeIterator + 'a;
/// Iterator over all words in the index.
type WordIter: Iterator<Item = &'a Arc<AlphanumRef>> + 'a;
/// Iterator over the words starting with a given character.
type WordFilteredIter: Iterator<Item = &'a Arc<AlphanumRef>> + 'a;
/// Records that `word` occurs in `document` at byte offset `index`.
fn insert_word(&mut self, word: impl AsRef<str>, document: Id, index: usize);
/// Removes all records of `document` from the index.
fn remove_document(&mut self, document: Id);
/// Returns `true` if `word` is recorded as occurring in `document`.
fn contains_word(&self, word: impl AsRef<str>, document: Id) -> bool;
/// The documents containing `word`, or `None` if the word is unknown.
fn documents_with_word(&'a self, word: impl AsRef<str>) -> Option<Self::DocumentIter>;
/// Estimated memory footprint of the index, in bytes.
fn size(&self) -> usize;
/// An upper bound on the number of distinct words in the index.
fn word_count_upper_limit(&self) -> usize;
// NOTE(review): the exact semantics of this limit are defined by callers
// outside this file — presumably a cap used during query expansion; verify.
fn word_count_limit(&self) -> usize;
fn words(&'a self) -> Self::WordIter;
fn words_starting_with(&'a self, c: char) -> Self::WordFilteredIter;
/// Threshold above which two words are considered proximate.
fn word_proximity_threshold(&self) -> f32;
fn word_proximity_algorithm(&self) -> proximity::Algorithm;
/// Splits `content` into words and inserts each into the index.
fn digest_document(&mut self, id: Id, content: &str) {
for item in SplitNonAlphanumeric::new(content) {
let word = item.word();
let index = item.index();
if word.is_empty() {
continue;
}
// Skip tokens containing embedded symbols (e.g. URLs, emoji); hyphens
// and dashes are allowed so hyphenated compounds get indexed.
if word.contains(|c: char| !c.is_alphanumeric() && !matches!(c, '-' | '—' | '–')) {
continue;
}
self.insert_word(word, id, index);
}
}
}
/// Yields the concrete occurrences (positions and ratings) of a word.
pub trait OccurenceProvider<'a> {
type Iter: Iterator<Item = Occurence> + 'a;
/// An iterator over the occurrences of `word`, or `None` if neither the word
/// nor any word proximate to it is known.
fn occurrences_of_word(&'a self, word: &'a str) -> Option<Self::Iter>;
}
/// A token produced by [`SplitNonAlphanumeric`]: either a plain word or a
/// hyphen-joined compound (e.g. `test-is`) spanning two adjacent words.
#[derive(Debug, Clone)]
enum SplitItem<'a> {
    Word { word: &'a str, index: usize },
    Hyphen { word: &'a str, index: usize },
}
impl<'a> SplitItem<'a> {
    /// The text of the token.
    #[inline]
    fn word(&self) -> &'a str {
        match self {
            Self::Word { word, index: _ } | Self::Hyphen { word, index: _ } => word,
        }
    }
    /// Byte offset of the token within the original string.
    #[inline]
    fn index(&self) -> usize {
        match self {
            Self::Word { word: _, index } | Self::Hyphen { word: _, index } => *index,
        }
    }
}
/// Iterator splitting a string at non-alphanumeric characters, additionally
/// yielding hyphen-joined compounds as [`SplitItem::Hyphen`] items between the
/// two halves.
#[derive(Debug)]
struct SplitNonAlphanumeric<'a> {
    s: &'a str,
    /// Byte offset where the next scan starts.
    start: usize,
    /// Start of the word preceding the last seen hyphen.
    hyphen_start: usize,
    /// `true` while scanning the word directly after a hyphen.
    hyphen_second_word: bool,
}
impl<'a> SplitNonAlphanumeric<'a> {
    fn new(s: &'a str) -> Self {
        Self {
            s,
            start: 0,
            hyphen_start: 0,
            hyphen_second_word: false,
        }
    }
}
impl<'a> Iterator for SplitNonAlphanumeric<'a> {
    type Item = SplitItem<'a>;
    fn next(&mut self) -> Option<Self::Item> {
        let iter = self.s[self.start..].char_indices();
        for (pos, c) in iter {
            if !c.is_alphanumeric() {
                // If the previous word ended with a hyphen, first emit the
                // whole compound before re-scanning its second half.
                if self.hyphen_second_word {
                    let hyphen_word = &self.s[self.hyphen_start..self.start + pos];
                    self.hyphen_second_word = false;
                    return Some(SplitItem::Hyphen {
                        word: hyphen_word,
                        index: self.hyphen_start,
                    });
                }
                if matches!(c, '-' | '—' | '–') {
                    self.hyphen_start = self.start;
                    self.hyphen_second_word = true;
                }
                let old_start = self.start;
                self.start = self.start.saturating_add(pos + c.len_utf8());
                return Some(SplitItem::Word {
                    index: old_start,
                    word: self.s.get(old_start..old_start + pos).unwrap(),
                });
            }
        }
        // Trailing word with no terminating character.
        let s = &self.s[self.start..];
        if !s.is_empty() {
            let old_start = self.start;
            self.start += s.len();
            // BUGFIX: report the word's starting offset. The previous version
            // advanced `self.start` first and used the updated value, so the
            // final word of a document got an end-of-input index.
            return Some(SplitItem::Word {
                index: old_start,
                word: s,
            });
        }
        None
    }
}
/// Either a single exact word or a precomputed list of proximate words, both
/// queryable for a proximity rating.
#[derive(Debug)]
enum ProximateListOrSingle<'a> {
    Single(&'a str),
    List(&'a proximity::ProximateList),
}
impl<'a> ProximateListOrSingle<'a> {
    /// Returns the proximity rating of `word`, or `None` if it doesn't match.
    /// An exact single word always rates `1.0`.
    #[allow(clippy::inline_always)]
    #[inline(always)]
    fn get(&self, word: &AlphanumRef) -> Option<f32> {
        match self {
            Self::List(list) => list.words.get(word).copied(),
            Self::Single(single) => {
                // The StrPtr only borrows `single` for this comparison and is
                // never stored, so the borrow cannot dangle.
                let candidate = Alphanumeral::new(unsafe { StrPtr::sref(single) });
                if *word == candidate {
                    Some(1.0)
                } else {
                    None
                }
            }
        }
    }
}
/// The set of documents a word occurs in, without position information.
#[derive(Debug)]
#[must_use]
pub struct SimpleDocMap {
    docs: BTreeSet<Id>,
}
impl SimpleDocMap {
    /// Creates an empty document set.
    pub fn new() -> Self {
        Self {
            docs: BTreeSet::new(),
        }
    }
    /// Returns `true` if `document` is in the set.
    #[must_use]
    pub fn contains(&self, document: Id) -> bool {
        self.docs.contains(&document)
    }
    /// Iterates over all documents, in ascending id order.
    pub fn documents(&self) -> Copied<btree_set::Iter<'_, Id>> {
        self.docs.iter().copied()
    }
    /// Marks `document` as containing the word.
    pub fn insert(&mut self, document: Id) {
        self.docs.insert(document);
    }
    /// Removes `document` from the set.
    pub fn remove(&mut self, document: Id) {
        self.docs.remove(&document);
    }
}
impl Default for SimpleDocMap {
    fn default() -> Self {
        Self::new()
    }
}
#[derive(Debug)]
#[must_use]
pub struct Simple {
words: BTreeMap<Arc<AlphanumRef>, SimpleDocMap>,
proximity_threshold: f32,
proximity_algo: proximity::Algorithm,
word_count_limit: usize,
}
impl Simple {
pub fn new(
proximity_threshold: f32,
proximity_algorithm: proximity::Algorithm,
word_count_limit: usize,
) -> Self {
Self {
words: BTreeMap::new(),
proximity_threshold,
proximity_algo: proximity_algorithm,
word_count_limit,
}
}
pub fn ingest(&mut self, other: Self) {
for (word, docs) in other.words {
if let Some(my_docs) = self.words.get_mut(&word) {
for doc in docs.documents() {
my_docs.insert(doc);
}
} else {
self.words.insert(word, docs);
}
}
}
}
impl Default for Simple {
fn default() -> Self {
Self::new(0.85, proximity::Algorithm::Hamming, 2_500)
}
}
/// Function type extracting the key from a map-iterator item; gives
/// `WordFilteredIter` a nameable `Map` function type.
type FirstTy<'a, T, I> = fn((&'a T, &'a I)) -> &'a T;
impl<'a> Provider<'a> for Simple {
type DocumentIter = Copied<btree_set::Iter<'a, Id>>;
type WordIter = btree_map::Keys<'a, Arc<AlphanumRef>, SimpleDocMap>;
type WordFilteredIter = Map<
btree_map::Range<'a, Arc<AlphanumRef>, SimpleDocMap>,
FirstTy<'a, Arc<AlphanumRef>, SimpleDocMap>,
>;
// `Simple` stores no positions, so `_index` is ignored.
fn insert_word(&mut self, word: impl AsRef<str>, document: Id, _index: usize) {
let word = word.as_ref();
// Borrowed StrPtr: valid only for this lookup, never stored.
let ptr = unsafe { StrPtr::sref(word) };
if let Some(word_docs) = self.words.get_mut(&Alphanumeral::new(ptr)) {
word_docs.insert(document);
} else {
let mut word_docs = SimpleDocMap::new();
word_docs.insert(document);
// Store the normalized (alphanumeric, lowercase) form in an owned
// allocation that the map keeps alive.
let word = Alphanumeral::new(word).chars().collect::<String>();
let ptr = StrPtr::owned(word);
self.words
.insert(Arc::new(Alphanumeral::new(ptr)), word_docs);
}
}
fn remove_document(&mut self, document: Id) {
for word_docs in self.words.values_mut() {
word_docs.remove(document);
}
}
fn contains_word(&self, word: impl AsRef<str>, document: Id) -> bool {
let word = word.as_ref();
// Borrowed StrPtr: valid only for this lookup.
let ptr = unsafe { StrPtr::sref(word) };
self.words
.get(&Alphanumeral::new(ptr))
.map_or(false, |word| word.contains(document))
}
fn documents_with_word(&'a self, word: impl AsRef<str>) -> Option<Self::DocumentIter> {
let ptr = unsafe { StrPtr::sref(word.as_ref()) };
self.words
.get(&Alphanumeral::new(ptr))
.map(SimpleDocMap::documents)
}
// Rough estimate: string bytes + fixed per-word overhead + 8 bytes per doc id.
fn size(&self) -> usize {
self.words
.iter()
.map(|(word, docs)| word.s.as_ref().len() + 16 + docs.docs.len() * 8 + 16)
.sum()
}
fn word_count_upper_limit(&self) -> usize {
self.words.len()
}
fn word_count_limit(&self) -> usize {
self.word_count_limit
}
fn words(&'a self) -> Self::WordIter {
self.words.keys()
}
// Words in the half-open range `[c, next_char(c))`.
// NOTE(review): this uses `Alphanumeral::new` for the bounds while
// `Lossless::words_starting_with` uses `Alphanumeral::whole`; for
// non-alphanumeric `c` the two behave differently — confirm which is intended.
fn words_starting_with(&'a self, c: char) -> Self::WordFilteredIter {
let s1 = String::from(c);
let ptr1 = StrPtr::owned(s1);
let s2 = String::from(next_char(c));
let ptr2 = StrPtr::owned(s2);
self.words
.range(Alphanumeral::new(ptr1)..Alphanumeral::new(ptr2))
.map::<&Arc<AlphanumRef>, _>(|(k, _v)| k)
}
fn word_proximity_threshold(&self) -> f32 {
self.proximity_threshold
}
fn word_proximity_algorithm(&self) -> proximity::Algorithm {
self.proximity_algo
}
}
/// Iterator over the occurrences of a word in a [`Simple`] index.
///
/// Since `Simple` stores no positions, each candidate document's registered
/// contents are re-tokenized to find where the word actually occurs.
#[derive(Debug)]
pub struct SimpleOccurrencesIter<'a, I> {
    /// (document id, matched index word, proximity rating) candidates.
    iter: I,
    /// The searched word, or its precomputed list of proximate words.
    words: ProximateListOrSingle<'a>,
    document_contents: &'a HashMap<Id, Arc<String>>,
    /// Tokenizer over the current document, its id, and the word searched for.
    #[allow(clippy::type_complexity)] current: Option<(SplitNonAlphanumeric<'a>, Id, &'a AlphanumRef)>,
    /// Whether the searched word was found in the current document.
    current_doc_matched: bool,
    /// Sink for (document, word) pairs the scan failed to find.
    missing: &'a Mutex<Vec<(Id, &'a AlphanumRef)>>,
}
impl<'a, I: Iterator<Item = (Id, &'a AlphanumRef, f32)>> SimpleOccurrencesIter<'a, I> {
    fn new(
        iter: I,
        words: ProximateListOrSingle<'a>,
        document_contents: &'a HashMap<Id, Arc<String>>,
        missing: &'a Mutex<Vec<(Id, &'a AlphanumRef)>>,
    ) -> Self {
        Self {
            iter,
            words,
            document_contents,
            current: None,
            current_doc_matched: false,
            missing,
        }
    }
}
impl<'a, I: Iterator<Item = (Id, &'a AlphanumRef, f32)>> Iterator for SimpleOccurrencesIter<'a, I> {
    type Item = Occurence;
    fn next(&mut self) -> Option<Self::Item> {
        // Drain matches from the document currently being scanned.
        if let Some((words_in_doc, doc_id, searching_word)) = &mut self.current {
            for item in words_in_doc {
                let word_in_doc = item.word();
                let start = item.index();
                // Borrowed StrPtr: used only for the proximity lookup below.
                let word_ptr = unsafe { StrPtr::sref(word_in_doc) };
                let word_ptr = Alphanumeral::new(word_ptr);
                let word = Alphanumeral::new(word_in_doc);
                // Skip tokens with no alphanumeric content.
                if word.chars().next().is_none() {
                    continue;
                }
                if let Some(word_proximity) = self.words.get(&word_ptr) {
                    self.current_doc_matched = true;
                    // Exact matches (proximity 1.0) add nothing; fuzzier
                    // matches are penalised linearly.
                    let rating = (word_proximity - 1.0) * 4.0;
                    let mut occ = Occurence::new(start, *doc_id, 0);
                    *occ.rating_mut() += rating;
                    return Some(occ);
                }
            }
            // Document exhausted with no match at all: the index claimed the
            // word was here, so record the stale entry.
            if !self.current_doc_matched {
                let mut missing = self.missing.lock().expect(
                    "lock is poisoned, other thread panicked.\
                    This should anyway not be accessed from multiple threads simultaneously",
                );
                missing.push((*doc_id, searching_word));
            }
        }
        // Advance to the next candidate document, skipping documents whose
        // contents were never registered via `add_document`.
        let (next_doc, word, _) = self.iter.next()?;
        let contents = if let Some(c) = self.document_contents.get(&next_doc) {
            c
        } else {
            return self.next();
        };
        // BUGFIX: reset the per-document flag before scanning a new document.
        // Previously it was never reset, so once any document matched, later
        // documents without a match were never reported as missing.
        self.current_doc_matched = false;
        self.current = Some((SplitNonAlphanumeric::new(contents), next_doc, word));
        self.next()
    }
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.iter.size_hint()
    }
}
/// Occurrence provider for [`Simple`]: because the index stores no positions,
/// documents must be registered with their full contents so occurrences can be
/// rediscovered by scanning.
#[derive(Debug)]
#[must_use]
pub struct SimpleOccurences<'a> {
    index: &'a Simple,
    word_proximates: &'a proximity::ProximateMap<'a>,
    document_contents: HashMap<Id, Arc<String>>,
    missing: Mutex<Vec<(Id, &'a AlphanumRef)>>,
}
impl<'a> SimpleOccurences<'a> {
    /// Creates a provider over `index`, using `word_proximates` for fuzzy
    /// matching.
    pub fn new(index: &'a Simple, word_proximates: &'a proximity::ProximateMap<'a>) -> Self {
        Self {
            index,
            word_proximates,
            document_contents: HashMap::new(),
            missing: Mutex::new(Vec::new()),
        }
    }
    /// Registers the full text of document `id` for scanning.
    pub fn add_document(&mut self, id: Id, content: Arc<String>) {
        self.document_contents.insert(id, content);
    }
    /// Returns the (document, word) pairs the index listed but scanning could
    /// not find — i.e. where the index is out of date.
    #[allow(clippy::missing_panics_doc)]
    pub fn missing(&self) -> MissingOccurrences {
        let guard = self.missing.lock().unwrap();
        let list = guard.iter().map(|(id, word)| (*id, word.owned())).collect();
        MissingOccurrences { list }
    }
}
impl<'a> OccurenceProvider<'a> for SimpleOccurences<'a> {
type Iter =
SimpleOccurrencesIter<'a, Box<dyn Iterator<Item = (Id, &'a AlphanumRef, f32)> + 'a>>;
/// For `Exact` proximity the word is looked up directly; otherwise it is
/// expanded through the precomputed proximate-word map and all proximate
/// words' documents are scanned (deduplicated and ordered via `BTreeSet`).
fn occurrences_of_word(&'a self, word: &'a str) -> Option<Self::Iter> {
if let proximity::Algorithm::Exact = self.index.word_proximity_algorithm() {
// Borrowed StrPtr: used only for this lookup.
let ptr = unsafe { StrPtr::sref(word) };
// Use the key stored in the index so the reference lives as long as `self.index`.
let word_ptr = self.index.words.get_key_value(&Alphanumeral::new(ptr))?.0;
Some(SimpleOccurrencesIter::new(
Box::new(
self.index
.documents_with_word(&**word_ptr)?
.map(move |id| (id, &**word_ptr, 1.0)),
),
ProximateListOrSingle::Single(word),
&self.document_contents,
&self.missing,
))
} else {
let words = self.word_proximates.get_or_panic(word);
Some(SimpleOccurrencesIter::new(
Box::new(
crate::proximity::proximate_word_docs(self.index, words)
.collect::<BTreeSet<_>>()
.into_iter()
.map(crate::proximity::ProximateDocItem::into_parts),
),
ProximateListOrSingle::List(words),
&self.document_contents,
&self.missing,
))
}
}
}
/// (document, word) pairs the index listed but a content scan couldn't find.
#[derive(Debug)]
pub struct MissingOccurrences {
    list: Vec<(Id, AlphanumRef)>,
}
impl MissingOccurrences {
    /// Removes every recorded pair from `index`, bringing it back in sync with
    /// the scanned contents.
    pub fn apply(self, index: &mut Simple) {
        self.list.into_iter().for_each(|(id, word)| {
            if let Some(docs) = index.words.get_mut(&word) {
                docs.remove(id);
            }
        });
    }
    /// The recorded (document, word) pairs.
    #[must_use]
    pub fn list(&self) -> &[(Id, AlphanumRef)] {
        &self.list
    }
}
/// The byte positions at which one word occurs within one document.
#[derive(Debug, PartialEq, Eq, Clone)]
#[must_use]
pub struct LosslessDocOccurrences {
    occurrences: Vec<usize>,
}
impl LosslessDocOccurrences {
    /// Creates an empty occurrence list.
    pub fn new() -> Self {
        Self::default()
    }
}
impl Default for LosslessDocOccurrences {
    fn default() -> Self {
        Self {
            occurrences: Vec::new(),
        }
    }
}
/// Per-word map from document id to the positions the word occurs at.
#[derive(Debug, PartialEq, Eq, Clone)]
#[must_use]
pub struct LosslessDocMap {
    docs: BTreeMap<Id, LosslessDocOccurrences>,
}
impl LosslessDocMap {
    /// Creates an empty map.
    pub fn new() -> Self {
        Self {
            docs: BTreeMap::new(),
        }
    }
    /// Iterates over all documents containing the word, ascending by id.
    pub fn documents(&self) -> Copied<btree_map::Keys<'_, Id, LosslessDocOccurrences>> {
        self.docs.keys().copied()
    }
    /// Returns `true` if the word occurs in `document`.
    #[must_use]
    pub fn contains(&self, document: Id) -> bool {
        self.docs.contains_key(&document)
    }
    /// Returns the occurrence list for `document`, creating it if absent.
    fn doc(&mut self, document: Id) -> &mut LosslessDocOccurrences {
        self.docs.entry(document).or_default()
    }
}
impl Default for LosslessDocMap {
    fn default() -> Self {
        Self::new()
    }
}
#[derive(Debug, Clone)]
#[must_use]
pub struct Lossless {
words: BTreeMap<Arc<AlphanumRef>, LosslessDocMap>,
proximity_threshold: f32,
proximity_algo: proximity::Algorithm,
word_count_limit: usize,
}
impl Lossless {
pub fn new(
proximity_threshold: f32,
proximity_algorithm: proximity::Algorithm,
word_count_limit: usize,
) -> Self {
Self {
words: BTreeMap::new(),
proximity_threshold,
proximity_algo: proximity_algorithm,
word_count_limit,
}
}
pub fn ingest(&mut self, other: Self) {
for (word, docs) in other.words {
if let Some(my_docs) = self.words.get_mut(&word) {
for (doc, mut occurrences) in docs.docs {
let my = my_docs.doc(doc);
my.occurrences.append(&mut occurrences.occurrences);
my.occurrences.sort_unstable();
my.occurrences.dedup();
}
} else {
self.words.insert(word, docs);
}
}
}
}
impl Default for Lossless {
fn default() -> Self {
Self::new(0.85, proximity::Algorithm::Hamming, 1000)
}
}
impl<'a> Provider<'a> for Lossless {
type DocumentIter = Copied<btree_map::Keys<'a, Id, LosslessDocOccurrences>>;
type WordIter = btree_map::Keys<'a, Arc<AlphanumRef>, LosslessDocMap>;
type WordFilteredIter = Map<
btree_map::Range<'a, Arc<AlphanumRef>, LosslessDocMap>,
FirstTy<'a, Arc<AlphanumRef>, LosslessDocMap>,
>;
/// Inserts `index` into the sorted, deduplicated occurrence list of
/// (`word`, `document`).
fn insert_word(&mut self, word: impl AsRef<str>, document: Id, index: usize) {
let word = word.as_ref();
// Borrowed StrPtr: valid only for this lookup, never stored.
let ptr = unsafe { StrPtr::sref(word) };
if let Some(word_docs) = self.words.get_mut(&Alphanumeral::new(ptr)) {
let doc = word_docs.doc(document);
// Binary-search keeps the list sorted and skips duplicate positions.
match doc.occurrences.binary_search(&index) {
Ok(_) => {}
Err(idx) => doc.occurrences.insert(idx, index),
}
} else {
let mut word_docs = LosslessDocMap::new();
word_docs.doc(document).occurrences.push(index);
// Store the normalized (alphanumeric, lowercase) form in an owned
// allocation that the map keeps alive.
let word = Alphanumeral::new(word).chars().collect::<String>();
let ptr = StrPtr::owned(word);
self.words
.insert(Arc::new(Alphanumeral::new(ptr)), word_docs);
}
}
fn remove_document(&mut self, document: Id) {
for word_docs in self.words.values_mut() {
word_docs.docs.remove(&document);
}
}
fn contains_word(&self, word: impl AsRef<str>, document: Id) -> bool {
let word = word.as_ref();
// Borrowed StrPtr: valid only for this lookup.
let ptr = unsafe { StrPtr::sref(word) };
self.words
.get(&Alphanumeral::new(ptr))
.map_or(false, |word| word.contains(document))
}
fn documents_with_word(&'a self, word: impl AsRef<str>) -> Option<Self::DocumentIter> {
let ptr = unsafe { StrPtr::sref(word.as_ref()) };
self.words
.get(&Alphanumeral::new(ptr))
.map(LosslessDocMap::documents)
}
// Rough estimate: string bytes + fixed per-word overhead + 8 bytes per stored
// position, plus per-document overhead.
fn size(&self) -> usize {
self.words
.iter()
.map(|(word, docs)| {
(word.s.as_ref().len() + 16)
+ (docs
.docs
.values()
.map(|occs| 8 + occs.occurrences.len() * 8 + 16)
.sum::<usize>()
+ 16)
})
.sum()
}
fn word_count_upper_limit(&self) -> usize {
self.words.len()
}
fn word_count_limit(&self) -> usize {
self.word_count_limit
}
fn words(&'a self) -> Self::WordIter {
self.words.keys()
}
// Words in the half-open range `[c, next_char(c))`.
// NOTE(review): `whole` keeps non-alphanumeric bound characters, whereas
// `Simple::words_starting_with` uses `Alphanumeral::new`; for
// non-alphanumeric `c` the two behave differently — confirm which is intended.
fn words_starting_with(&'a self, c: char) -> Self::WordFilteredIter {
let s1 = String::from(c);
let ptr1 = StrPtr::owned(s1);
let s2 = String::from(next_char(c));
let ptr2 = StrPtr::owned(s2);
let a1 = Alphanumeral::whole(ptr1);
let a2 = Alphanumeral::whole(ptr2);
self.words
.range(a1..a2)
.map::<&Arc<AlphanumRef>, _>(|(k, _v)| k)
}
fn word_proximity_threshold(&self) -> f32 {
self.proximity_threshold
}
fn word_proximity_algorithm(&self) -> proximity::Algorithm {
self.proximity_algo
}
}
/// Occurrence provider for [`Lossless`]: positions are read straight from the
/// index, so no document contents need to be registered.
#[derive(Debug)]
#[must_use]
pub struct LosslessOccurrences<'a> {
    index: &'a Lossless,
    word_proximates: &'a proximity::ProximateMap<'a>,
}
impl<'a> LosslessOccurrences<'a> {
    /// Creates a provider over `index`, using `word_proximates` for fuzzy
    /// matching.
    pub fn new(index: &'a Lossless, word_proximates: &'a proximity::ProximateMap<'a>) -> Self {
        LosslessOccurrences {
            index,
            word_proximates,
        }
    }
}
/// Iterator converting (positions, document, proximity) triples into
/// [`Occurence`]s, streaming each document's position list in turn.
#[derive(Debug)]
pub struct LosslessOccurrencesIter<'a, I> {
    iter: I,
    /// Remaining positions of the current document, its id, and the proximity
    /// rating of the matched word.
    #[allow(clippy::type_complexity)] current: Option<(core::slice::Iter<'a, usize>, Id, f32)>,
}
impl<'a, I: Iterator<Item = (&'a Vec<usize>, Id, f32)>> LosslessOccurrencesIter<'a, I> {
    /// Wraps an iterator of (positions, document, proximity) triples.
    fn new(iter: I) -> Self {
        Self { iter, current: None }
    }
}
impl<'a, I: Iterator<Item = (&'a Vec<usize>, Id, f32)>> Iterator
    for LosslessOccurrencesIter<'a, I>
{
    type Item = Occurence;
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            // Drain the current document's positions first.
            if let Some((positions, doc_id, word_proximity)) = &mut self.current {
                if let Some(start) = positions.next() {
                    let mut occ = Occurence::new(*start, *doc_id, 0);
                    // Exact matches (proximity 1.0) add nothing; fuzzier
                    // matches are penalised linearly.
                    *occ.rating_mut() += (*word_proximity - 1.0) * 4.0;
                    return Some(occ);
                }
            }
            // Move on to the next document, or stop when the source is done.
            let (occurrences, doc, word_proximity) = self.iter.next()?;
            self.current = Some((occurrences.iter(), doc, word_proximity));
        }
    }
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.iter.size_hint()
    }
}
impl<'a> OccurenceProvider<'a> for LosslessOccurrences<'a> {
type Iter =
LosslessOccurrencesIter<'a, Box<dyn Iterator<Item = (&'a Vec<usize>, Id, f32)> + 'a>>;
/// For `Exact` proximity the word is looked up directly; otherwise all
/// proximate words' documents are gathered (deduplicated and ordered via
/// `BTreeSet`) and their stored occurrence lists streamed out.
fn occurrences_of_word(&'a self, word: &'a str) -> Option<Self::Iter> {
if let proximity::Algorithm::Exact = self.index.word_proximity_algorithm() {
// Borrowed StrPtr: used only for this lookup.
let ptr = unsafe { StrPtr::sref(word) };
let word_ptr = self.index.words.get_key_value(&Alphanumeral::new(ptr))?.0;
Some(LosslessOccurrencesIter::new(Box::new(
self.index
.words
.get(&**word_ptr)?
.docs
.iter()
.map(|(id, occs)| (&occs.occurrences, *id, 1.0)),
)))
} else {
let words = self.word_proximates.get_or_panic(word);
Some(LosslessOccurrencesIter::new(Box::new(
crate::proximity::proximate_word_docs(self.index, words)
.collect::<BTreeSet<_>>()
.into_iter()
.map(crate::proximity::ProximateDocItem::into_parts)
.map(|(id, word, rating)| {
// NOTE(review): these indexing operations panic on a miss;
// presumably `proximate_word_docs` only yields words and
// documents present in `self.index` — verify.
(&self.index.words[word].docs[&id].occurrences, id, rating)
}),
)))
}
}
}
#[cfg(test)]
mod tests {
use std::str::FromStr;
use crate::query;
use super::{
Alphanumeral, DocumentMap, Id, Lossless, LosslessOccurrences, Occurence, OccurenceProvider,
Provider, Simple, SimpleOccurences,
};
// Two paragraphs of lorem ipsum; "lorem"/"Lorem" occurs at bytes 0 and 875.
fn doc1() -> &'static str {
"\
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Mauris interdum, metus ut consectetur ullamcorper, velit mi placerat diam, vitae rutrum quam magna sit amet lacus. Curabitur ut rutrum ante. Pellentesque vel neque ante. Nullam vel velit ut ipsum luctus varius id porta nisi. Morbi hendrerit, nunc non consequat consequat, dolor mi consectetur eros, vitae varius diam leo in sem. Aliquam erat volutpat. Proin id mollis quam. Morbi venenatis tincidunt nunc eget ullamcorper. Cras hendrerit libero enim, et aliquet diam rutrum ut. Duis auctor ligula libero, cursus ullamcorper libero porttitor eget. Aliquam scelerisque ac elit at condimentum. Fusce sit amet purus posuere, suscipit libero id, tincidunt nulla. Aliquam molestie orci vitae tellus commodo, nec mattis purus efficitur. Quisque quam nisl, fermentum sit amet ante vitae, finibus aliquet nunc. Ut ut hendrerit lorem.
Nam porttitor urna leo, sit amet imperdiet libero vulputate sed. Morbi elementum ligula turpis, at mattis risus finibus vitae. Vestibulum id egestas tortor. Curabitur suscipit nulla dolor. Duis rhoncus et felis dignissim bibendum. Sed congue arcu quis lacinia iaculis. Nam sit amet lacus sit amet lacus efficitur bibendum."
}
// A second fixture; "Lorem" occurs only at byte 0.
fn doc2() -> &'static str {
"\
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla lectus orci, aliquam ut justo varius, consequat semper enim. Vestibulum porttitor justo sed tincidunt fringilla. Donec sit amet sollicitudin mi, eu bibendum orci. Maecenas at feugiat ipsum. Vestibulum libero dolor, egestas et sollicitudin eu, ornare sit amet mauris. Maecenas in dolor volutpat, rhoncus urna id, luctus sem. Nulla pulvinar non ex eu venenatis.
Aliquam euismod, justo eu viverra ornare, ex nisi interdum neque, in rutrum nunc mi sit amet libero. Aenean nec arcu pulvinar, venenatis erat ac, sodales massa. Morbi quam leo, cursus at est a, placerat aliquam mauris. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. In hac habitasse platea dictumst. In consectetur aliquet aliquam. In vel tempor elit, eget auctor dolor. Phasellus molestie est eget posuere imperdiet. Donec sagittis tincidunt facilisis. Sed eu pulvinar lectus, euismod dictum tellus. Nulla lacinia diam quis odio ultrices, viverra dictum arcu mollis. Donec tempor diam eget tristique maximus. Etiam a dui eu augue euismod dignissim."
}
// `Alphanumeral` comparison ignores non-alphanumeric characters and case.
#[test]
fn alphanumerical() {
let s1 = Alphanumeral::new("test-is good!");
let s2 = Alphanumeral::new("TESTIsGood");
assert_eq!(s1, s2);
assert_eq!(s1.cmp(&s2), std::cmp::Ordering::Equal);
assert_eq!(s1.cmp(&"TestIsGood1".into()), std::cmp::Ordering::Less);
assert_eq!(s1.cmp(&"TestIsGooc".into()), std::cmp::Ordering::Greater);
assert_eq!(s1.cmp(&"TestIsGooe".into()), std::cmp::Ordering::Less);
}
// `Simple` index: occurrences are recovered by re-scanning registered
// document contents.
#[test]
fn occurences_simple() {
let mut doc_map = DocumentMap::new();
let mut index = Simple::new(1.0, crate::proximity::Algorithm::Exact, 100);
doc_map.insert("doc1", doc1(), &mut index);
doc_map.insert("doc3", doc2(), &mut index);
assert!(index.contains_word("lorem", doc_map.get_id("doc1").unwrap()));
assert!(index.contains_word("lorem", doc_map.get_id("doc3").unwrap()));
assert_eq!(doc_map.get_id("doc3"), Some(Id::new(1)));
assert_eq!(doc_map.get_id("doc2"), None);
let proximate_map = crate::proximity::ProximateMap::new();
let mut simple_provider = SimpleOccurences::new(&index, &proximate_map);
simple_provider.add_document(doc_map.get_id("doc1").unwrap(), doc1().to_string().into());
simple_provider.add_document(doc_map.get_id("doc3").unwrap(), doc2().to_string().into());
let mut occurrences = simple_provider.occurrences_of_word("lorem").unwrap();
assert_eq!(
occurrences.next(),
Some(Occurence::new(0, doc_map.get_id("doc1").unwrap(), 0))
);
assert_eq!(
occurrences.next(),
Some(Occurence::new(875, doc_map.get_id("doc1").unwrap(), 0))
);
assert_eq!(
occurrences.next(),
Some(Occurence::new(0, doc_map.get_id("doc3").unwrap(), 0))
);
assert_eq!(occurrences.next(), None);
}
// `Lossless` index: the same positions, read directly from the stored lists.
#[test]
fn occurences_lossless() {
let mut doc_map = DocumentMap::new();
let mut index = Lossless::new(1.0, crate::proximity::Algorithm::Exact, 100);
doc_map.insert("doc1", doc1(), &mut index);
doc_map.insert("doc3", doc2(), &mut index);
assert!(index.contains_word("lorem", doc_map.get_id("doc1").unwrap()));
assert!(index.contains_word("lorem", doc_map.get_id("doc3").unwrap()));
assert_eq!(doc_map.get_id("doc3"), Some(Id::new(1)));
assert_eq!(doc_map.get_id("doc2"), None);
let proximate_map = crate::proximity::ProximateMap::new();
let lossless_provider = LosslessOccurrences::new(&index, &proximate_map);
let mut occurrences = lossless_provider.occurrences_of_word("lorem").unwrap();
assert_eq!(
occurrences.next(),
Some(Occurence::new(0, doc_map.get_id("doc1").unwrap(), 0))
);
assert_eq!(
occurrences.next(),
Some(Occurence::new(875, doc_map.get_id("doc1").unwrap(), 0))
);
assert_eq!(
occurrences.next(),
Some(Occurence::new(0, doc_map.get_id("doc3").unwrap(), 0))
);
assert_eq!(occurrences.next(), None);
}
// NOT queries with exact matching: the document containing the negated word
// must rank below the one without it.
#[test]
fn occurences_lossless_and_not_1() {
let mut doc_map = DocumentMap::new();
let mut index = Lossless::new(1.0, crate::proximity::Algorithm::Exact, 100);
doc_map.insert("doc1", "organization stop", &mut index);
doc_map.insert("doc3", "organization hello", &mut index);
let query = query::Query::from_str("organization -stop").unwrap();
let mut docs = query.documents(&index);
let docs_v: Vec<_> = docs.iter().unwrap().collect();
let map = docs.take_proximate_map();
let occs = LosslessOccurrences::new(&index, &map);
let mut occs: Vec<_> = query.occurrences(&occs, 1000).unwrap().collect();
occs.sort_unstable_by(|a, b| b.rating().total_cmp(&a.rating()));
assert_eq!(docs_v.len(), 2);
assert_eq!(occs.len(), 2);
assert_eq!(occs[0].id(), doc_map.get_id("doc3").unwrap());
assert_eq!(occs[1].id(), doc_map.get_id("doc1").unwrap());
assert!(occs[0].rating() > occs[1].rating());
}
// NOT queries with fuzzy (Jaro) matching over several documents, including
// non-ASCII content.
#[test]
fn occurences_lossless_and_not_2() {
let mut doc_map = DocumentMap::new();
let mut index = Lossless::new(0.9, crate::proximity::Algorithm::Jaro, 100);
doc_map.insert("doc1", "Stop breathing - save the planet
Som vår kära SD-politker påpekade, människor andas ut co2! Genom att begå massmord kan vi rädda planeten. Dessutom kommer människor inte kunna förstöra klimatet om vi inte existerar!
SB for the planet's futute!", &mut index);
doc_map.insert(
"doc2",
"Sydkusten Marathon
Vi springer. Ganska klimatsmart 🏆",
&mut index,
);
doc_map.insert(
"doc3",
"Mathilda
Mathildas egna grupp",
&mut index,
);
doc_map.insert(
"doc4",
"PSKG
hejsan hejsan",
&mut index,
);
doc_map.insert(
"doc5",
"Klimatvampyrerna
Vi vill ha erat blooooooood",
&mut index,
);
let query = query::Query::from_str("vi -planet").unwrap();
println!("{query:?}");
let mut docs = query.documents(&index);
let docs_v: Vec<_> = docs.iter().unwrap().collect();
let map = docs.take_proximate_map();
let occs = LosslessOccurrences::new(&index, &map);
let mut occs: Vec<_> = query.occurrences(&occs, 100_000).unwrap().collect();
occs.sort_unstable_by(|a, b| b.rating().total_cmp(&a.rating()));
println!("{occs:#?}");
assert_eq!(docs_v.len(), 3);
assert_eq!(occs.len(), 3);
assert_eq!(occs[0].id(), doc_map.get_id("doc2").unwrap());
assert_eq!(occs[1].id(), doc_map.get_id("doc5").unwrap());
assert_eq!(occs[2].id(), doc_map.get_id("doc1").unwrap());
assert!(occs[2].rating() < -5.);
assert!(occs[0].rating() >= occs[1].rating());
}
}