{-# LANGUAGE BangPatterns #-}
{-# OPTIONS_HADDOCK hide, prune #-}
-- |
-- Module         : Data.ByteString.Search.Internal.KnuthMorrisPratt
-- Copyright      : Justin Bailey
--                  Chris Kuklewicz
--                  Daniel Fischer
-- Licence        : BSD3
-- Maintainer     : Daniel Fischer <daniel.is.fischer@googlemail.com>
-- Stability      : Provisional
-- Portability    : non-portable (BangPatterns)
--
-- Fast Knuth-Morris-Pratt search of both strict and
-- lazy 'S.ByteString' values.
--
-- A description of the algorithm can be found at
-- <http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm>.

-- Original authors: Justin Bailey (jgbailey at gmail.com) and
-- Chris Kuklewicz (haskell at list.mightyreason.com).

module Data.ByteString.Search.Internal.KnuthMorrisPratt ( -- * Overview
                                                          -- $overview

                                                          -- * Partial application
                                                          -- $partial

                                                          -- * Complexity and Performance
                                                          -- $complexity

                                                          -- * Finding substrings
                                                          -- ** Overlapping
                                                            indicesL
                                                          , indicesS
                                                          -- ** Non-overlapping
                                                          , matchLL
                                                          , matchLS
                                                          , matchSL
                                                          , matchSS
                                                          ) where

import Data.ByteString.Search.Internal.Utils (kmpBorders, strictify)

import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L
import Data.ByteString.Unsafe (unsafeIndex)

import Data.Array.Base (unsafeAt)
--import Data.Array.Unboxed

import Data.Int (Int64)

-- $overview
--
-- This module exports 6 search functions: 'matchLL', 'matchLS',
-- 'matchSL', and 'matchSS', which find the indices of all non-overlapping
-- occurrences of a pattern in a target string, and the newly added
-- 'indicesL' and 'indicesS' which find the indices of
-- all (possibly overlapping) occurrences of the pattern in the target
-- string. The performance should be the same when the pattern can't
-- overlap, but when the pattern occurs often and can have significant
-- overlap, the search excluding the overlap is faster.
--
-- In all cases, the list of indices is produced lazily.
--
-- The behaviour of the old @matchXY@ functions for an empty pattern has
-- changed, formerly they returned an empty list, now all functions
-- return @[0 .. 'length' target]@ for an empty pattern.
--
-- The return type of the @matchXS@ functions changed to @['Int']@, since
-- strict ByteStrings are @'Int'@-indexed.
--
-- The trailing @L\/S@ in the function names indicate whether they work
-- on lazy or strict ByteStrings. Since all patterns are converted to
-- strict ByteStrings for performance reasons, the @matchLX@ add just
-- an additional bit of wrapping around the worker in comparison to
-- @matchSX@. For the new functions, no such wrapping is provided, you
-- have to 'strictify' lazy patterns before feeding them to the searcher.
-- The limit on the pattern lengths that the conversion to a strict
-- ByteString imposes should be irrelevant in practice.
--
-- The functions searching in lazy ByteStrings don't keep any references
-- to chunks already traversed. This means the garbage collector can free
-- those chunks early and only a small part of the target string needs to
-- be in memory.

-- $partial
--
-- These functions can all be usefully partially applied. Given only a
-- pattern, the auxiliary data will be computed only once, allowing for
-- efficient re-use.

-- $complexity
--
-- The preprocessing of the pattern is /O/(@patternLength@) in time and space.
-- The time complexity of the searching phase is /O/(@targetLength@) for all
-- functions.
--
-- In most cases, these functions are considerably slower than the
-- Boyer-Moore variants, performance is close to that of those from
-- "Data.ByteString.Search.DFA" resp. "Data.ByteString.Lazy.Search.DFA".

------------------------------------------------------------------------------
--                                 Wrappers                                 --
------------------------------------------------------------------------------

-- | @'indicesL'@ finds all indices of (possibly overlapping)
--   occurrences of the pattern in the target string.
{-# INLINE indicesL #-}
indicesL :: S.ByteString     -- ^ Strict pattern
         -> L.ByteString     -- ^ Lazy target string
         -> [Int64]          -- ^ Offsets of matches
indicesL :: ByteString -> ByteString -> [Int64]
indicesL ByteString
pat = [ByteString] -> [Int64]
search ([ByteString] -> [Int64])
-> (ByteString -> [ByteString]) -> ByteString -> [Int64]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> [ByteString]
L.toChunks
  where
    search :: [ByteString] -> [Int64]
search = Bool -> ByteString -> [ByteString] -> [Int64]
forall a. Integral a => Bool -> ByteString -> [ByteString] -> [a]
matcher Bool
True ByteString
pat

-- | @'indicesS'@ finds all indices of (possibly overlapping)
--   occurrences of the pattern in the target string.
{-# INLINE indicesS #-}
indicesS :: S.ByteString     -- ^ Strict pattern
         -> S.ByteString     -- ^ Strict target string
         -> [Int]            -- ^ Offsets of matches
indicesS :: ByteString -> ByteString -> [Int]
indicesS ByteString
pat = [ByteString] -> [Int]
search ([ByteString] -> [Int])
-> (ByteString -> [ByteString]) -> ByteString -> [Int]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (ByteString -> [ByteString] -> [ByteString]
forall a. a -> [a] -> [a]
:[])
  where
    search :: [ByteString] -> [Int]
search = Bool -> ByteString -> [ByteString] -> [Int]
forall a. Integral a => Bool -> ByteString -> [ByteString] -> [a]
matcher Bool
True ByteString
pat

-- | @'matchLL'@ finds the starting indices of all /non-overlapping/ occurrences
--   of the pattern in the target string. It is a simple wrapper around
--   'Data.ByteString.Lazy.Search.KMP.nonOverlappingIndices' strictifying
--   the pattern.
{-# INLINE matchLL #-}
matchLL :: L.ByteString           -- ^ Lazy pattern
        -> L.ByteString           -- ^ Lazy target string
        -> [Int64]                -- ^ Offsets of matches
matchLL :: ByteString -> ByteString -> [Int64]
matchLL ByteString
pat = [ByteString] -> [Int64]
search ([ByteString] -> [Int64])
-> (ByteString -> [ByteString]) -> ByteString -> [Int64]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> [ByteString]
L.toChunks
  where
    !spat :: ByteString
spat = ByteString -> ByteString
strictify ByteString
pat
    search :: [ByteString] -> [Int64]
search = Bool -> ByteString -> [ByteString] -> [Int64]
forall a. Integral a => Bool -> ByteString -> [ByteString] -> [a]
matcher Bool
False ByteString
spat

-- | @'matchLS'@ finds the starting indices of all /non-overlapping/ occurrences
--   of the pattern in the target string. It is a simple wrapper around
--   'Data.ByteString.Search.KMP.nonOverlappingIndices' strictifying
--   the pattern.
{-# INLINE matchLS #-}
matchLS :: L.ByteString         -- ^ Lazy pattern
        -> S.ByteString         -- ^ Strict target string
        -> [Int]                -- ^ Offsets of matches
matchLS :: ByteString -> ByteString -> [Int]
matchLS ByteString
pat = [ByteString] -> [Int]
search ([ByteString] -> [Int])
-> (ByteString -> [ByteString]) -> ByteString -> [Int]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (ByteString -> [ByteString] -> [ByteString]
forall a. a -> [a] -> [a]
:[])
  where
    !spat :: ByteString
spat = ByteString -> ByteString
strictify ByteString
pat
    search :: [ByteString] -> [Int]
search = Bool -> ByteString -> [ByteString] -> [Int]
forall a. Integral a => Bool -> ByteString -> [ByteString] -> [a]
matcher Bool
False ByteString
spat

-- | @'matchSS'@ finds the starting indices of all /non-overlapping/ occurrences
--   of the pattern in the target string. It is an alias for
--   'Data.ByteString.Search.KMP.nonOverlappingIndices'.
{-# INLINE matchSS #-}
matchSS :: S.ByteString         -- ^ Strict pattern
        -> S.ByteString         -- ^ Strict target string
        -> [Int]                -- ^ Offsets of matches
matchSS :: ByteString -> ByteString -> [Int]
matchSS ByteString
pat = [ByteString] -> [Int]
search ([ByteString] -> [Int])
-> (ByteString -> [ByteString]) -> ByteString -> [Int]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (ByteString -> [ByteString] -> [ByteString]
forall a. a -> [a] -> [a]
:[])
  where
    search :: [ByteString] -> [Int]
search = Bool -> ByteString -> [ByteString] -> [Int]
forall a. Integral a => Bool -> ByteString -> [ByteString] -> [a]
matcher Bool
False ByteString
pat

-- | @'matchSL'@ finds the starting indices of all /non-overlapping/ occurrences
--   of the pattern in the target string. It is an alias for
--   'Data.ByteString.Lazy.Search.KMP.nonOverlappingIndices'.
{-# INLINE matchSL #-}
matchSL :: S.ByteString         -- ^ Strict pattern
        -> L.ByteString         -- ^ Lazy target string
        -> [Int64]              -- ^ Offsets of matches
matchSL :: ByteString -> ByteString -> [Int64]
matchSL ByteString
pat = [ByteString] -> [Int64]
search ([ByteString] -> [Int64])
-> (ByteString -> [ByteString]) -> ByteString -> [Int64]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> [ByteString]
L.toChunks
  where
    search :: [ByteString] -> [Int64]
search = Bool -> ByteString -> [ByteString] -> [Int64]
forall a. Integral a => Bool -> ByteString -> [ByteString] -> [a]
matcher Bool
False ByteString
pat


------------------------------------------------------------------------------
--                                  Worker                                  --
------------------------------------------------------------------------------

{-# SPECIALISE matcher :: Bool -> S.ByteString -> [S.ByteString] -> [Int],
                          Bool -> S.ByteString -> [S.ByteString] -> [Int64] #-}
matcher :: Integral a => Bool -> S.ByteString -> [S.ByteString] -> [a]
matcher :: forall a. Integral a => Bool -> ByteString -> [ByteString] -> [a]
matcher Bool
_ !ByteString
pat
  | ByteString -> Bool
S.null ByteString
pat =  (a
0 a -> [a] -> [a]
forall a. a -> [a] -> [a]
:) ([a] -> [a]) -> ([ByteString] -> [a]) -> [ByteString] -> [a]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. a -> [ByteString] -> [a]
forall {t}. Num t => t -> [ByteString] -> [t]
go a
0
    where
      go :: t -> [ByteString] -> [t]
go t
_ [] = []
      go !t
prior (!ByteString
str : [ByteString]
rest) = [t
prior t -> t -> t
forall a. Num a => a -> a -> a
+ Int -> t
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
i | Int
i <- [Int
1 .. Int
l]]
                                  [t] -> [t] -> [t]
forall a. [a] -> [a] -> [a]
++ t -> [ByteString] -> [t]
go t
prior' [ByteString]
rest
        where
          !l :: Int
l = ByteString -> Int
S.length ByteString
str
          !prior' :: t
prior' = t
prior t -> t -> t
forall a. Num a => a -> a -> a
+ Int -> t
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
l
matcher !Bool
overlap ByteString
pat = a -> Int -> [ByteString] -> [a]
forall {p}. Num p => p -> Int -> [ByteString] -> [p]
searcher a
0 Int
0
  where
    !patLen :: Int
patLen = ByteString -> Int
S.length ByteString
pat
    !bords :: UArray Int Int
bords  = ByteString -> UArray Int Int
kmpBorders ByteString
pat
    !patH :: Word8
patH   = Int -> Word8
patAt Int
0
    {-# INLINE misi #-}
    misi :: Int -> Int
misi !Int
i = UArray Int Int -> Int -> Int
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> Int -> e
unsafeAt UArray Int Int
bords Int
i
    {-# INLINE patAt #-}
    patAt :: Int -> Word8
patAt !Int
i = ByteString -> Int -> Word8
unsafeIndex ByteString
pat Int
i
    !ami :: Int
ami    = if Bool
overlap then Int -> Int
misi Int
patLen else Int
0
    searcher :: p -> Int -> [ByteString] -> [p]
searcher p
_ Int
_ [] = []
    searcher !p
prior !Int
patPos (!ByteString
str : [ByteString]
rest)
      | Int
patPos Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0 = Int -> [p]
checkHead Int
0
      | Bool
otherwise = Int -> Int -> [p]
findMatch Int
patPos Int
0
      where
        !strLen :: Int
strLen = ByteString -> Int
S.length ByteString
str
        {-# INLINE strAt #-}
        strAt :: Int -> Word8
strAt !Int
i = ByteString -> Int -> Word8
unsafeIndex ByteString
str Int
i
        checkHead :: Int -> [p]
checkHead !Int
strI
            | Int
strI Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
strLen =
              p -> Int -> [ByteString] -> [p]
searcher (p
prior p -> p -> p
forall a. Num a => a -> a -> a
+ Int -> p
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
strLen) Int
0 [ByteString]
rest
            | Int -> Word8
strAt Int
strI Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
patH = Int -> Int -> [p]
findMatch Int
1 (Int
strI Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1)
            | Bool
otherwise = Int -> [p]
checkHead (Int
strI Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1)
        findMatch :: Int -> Int -> [p]
findMatch !Int
patI !Int
strI
            | Int
patI Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
patLen =
                (p
prior p -> p -> p
forall a. Num a => a -> a -> a
+ Int -> p
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
strI p -> p -> p
forall a. Num a => a -> a -> a
- Int -> p
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
patLen)
                    p -> [p] -> [p]
forall a. a -> [a] -> [a]
: if Int
ami Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0 then Int -> [p]
checkHead Int
strI else Int -> Int -> [p]
findMatch Int
ami Int
strI
            | Int
strI Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
strLen =
                p -> Int -> [ByteString] -> [p]
searcher (p
prior p -> p -> p
forall a. Num a => a -> a -> a
+ Int -> p
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
strLen) Int
patI [ByteString]
rest
            | Bool
otherwise      =
                if Int -> Word8
strAt Int
strI Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Int -> Word8
patAt Int
patI
                    then Int -> Int -> [p]
findMatch (Int
patI Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (Int
strI Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1)
                    else case Int -> Int
misi Int
patI of
                            Int
0    -> Int -> [p]
checkHead Int
strI
                            (-1) -> Int -> [p]
checkHead (Int
strI Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1)
                            Int
pI   -> Int -> Int -> [p]
findMatch Int
pI Int
strI