Skip to content

Commit ca2a7c3

Browse files
committed
Add LMVSK types and folds
1 parent dc8abb0 commit ca2a7c3

File tree

1 file changed

+152
-0
lines changed

1 file changed

+152
-0
lines changed

Statistics/Sample/Fold.hs

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,17 @@ module Statistics.Sample.Fold
113113
-- * Joint distributions
114114
, covariance
115115
, correlation
116+
117+
-- ** Single pass Length, Mean, Variance, Skewness and Kurtosis
118+
-- $lmvsk
119+
, LMVSK(..)
120+
, LMVSKState
121+
, fastLMVSK
122+
, fastLMVSKu
123+
, foldLMVSKState
124+
, getLMVSK
125+
, getLMVSKu
126+
116127
-- * Strict types and helpers
117128
, F.fold
118129
, biExpectation
@@ -522,6 +533,144 @@ correlation (muX, muY) =
522533
(\(_,y) -> square (y - muY))
523534
{-# INLINE correlation #-}
524535

536+
-- $lmvsk
537+
--
538+
-- 'LMVSK' represents the Length, Mean, Variance, Skewness
539+
-- and Kurtosis of a sample. If you need to compute several
540+
-- of these statistics at once, using these folds may be more
541+
-- efficient than combining the 'Fold's above Applicatively.
542+
--
543+
-- The algorithm used is similar to the found in 'fastVariance' etc.
544+
-- but common subexpressions are shared.
545+
546+
data LMVSK = LMVSK
547+
{ lmvskLength :: {-# UNPACK #-}!Int
548+
, lmvskMean :: {-# UNPACK #-}!Double
549+
, lmvskVariance :: {-# UNPACK #-}!Double
550+
, lmvskSkewness :: {-# UNPACK #-}!Double
551+
, lmvskKurtosis :: {-# UNPACK #-}!Double
552+
} deriving (Show, Eq)
553+
554+
-- | Intermediate state for computing 'LMVSK'. This type's 'Semigroup'
555+
-- instance allows it to be computed in parallel and combined
556+
newtype LMVSKState = LMVSKState LMVSK
557+
558+
instance Monoid LMVSKState where
559+
{-# INLINE mempty #-}
560+
mempty = LMVSKState lmvskZero
561+
{-# INLINE mappend #-}
562+
mappend = (<>)
563+
564+
instance Semigroup LMVSKState where
565+
{-# INLINE (<>) #-}
566+
(LMVSKState (LMVSK an am1 am2 am3 am4)) <> (LMVSKState (LMVSK bn bm1 bm2 bm3 bm4))
567+
= LMVSKState (LMVSK n m1 m2 m3 m4) where
568+
fi :: Int -> Double
569+
fi = fromIntegral
570+
-- combined.n = a.n + b.n;
571+
n = an+bn
572+
n2 = n*n
573+
nd = fi n
574+
nda = fi an
575+
ndb = fi bn
576+
-- delta = b.M1 - a.M1;
577+
delta = bm1 - am1
578+
-- delta2 = delta*delta;
579+
delta2 = delta*delta
580+
-- delta3 = delta*delta2;
581+
delta3 = delta*delta2
582+
-- delta4 = delta2*delta2;
583+
delta4 = delta2*delta2
584+
-- combined.M1 = (a.n*a.M1 + b.n*b.M1) / combined.n;
585+
m1 = (nda*am1 + ndb*bm1 ) / nd
586+
-- combined.M2 = a.M2 + b.M2 + delta2*a.n*b.n / combined.n;
587+
m2 = am2 + bm2 + delta2*nda*ndb / nd
588+
-- combined.M3 = a.M3 + b.M3 + delta3*a.n*b.n* (a.n - b.n)/(combined.n*combined.n);
589+
m3 = am3 + bm3 + delta3*nda*ndb* fi( an - bn )/ fi n2
590+
-- combined.M3 += 3.0*delta * (a.n*b.M2 - b.n*a.M2) / combined.n;
591+
+ 3.0*delta * (nda*bm2 - ndb*am2 ) / nd
592+
--
593+
-- combined.M4 = a.M4 + b.M4 + delta4*a.n*b.n * (a.n*a.n - a.n*b.n + b.n*b.n) /(combined.n*combined.n*combined.n);
594+
m4 = am4 + bm4 + delta4*nda*ndb *fi(an*an - an*bn + bn*bn ) / fi (n*n*n)
595+
-- combined.M4 += 6.0*delta2 * (a.n*a.n*b.M2 + b.n*b.n*a.M2)/(combined.n*combined.n) +
596+
+ 6.0*delta2 * (nda*nda*bm2 + ndb*ndb*am2) / fi n2
597+
-- 4.0*delta*(a.n*b.M3 - b.n*a.M3) / combined.n;
598+
+ 4.0*delta*(nda*bm3 - ndb*am3) / nd
599+
600+
-- | Efficiently compute the
601+
-- __length, mean, variance, skewness and kurtosis__ with a single pass.
602+
{-# INLINE fastLMVSK #-}
603+
fastLMVSK :: F.Fold Double LMVSK
604+
fastLMVSK = getLMVSK <$> foldLMVSKState
605+
606+
-- | Efficiently compute the
607+
-- __length, mean, unbiased variance, skewness and kurtosis__ with a single pass.
608+
{-# INLINE fastLMVSKu #-}
609+
fastLMVSKu :: F.Fold Double LMVSK
610+
fastLMVSKu = getLMVSKu <$> foldLMVSKState
611+
612+
-- | Initial 'LMVSKState'
613+
{-# INLINE lmvskZero #-}
614+
lmvskZero :: LMVSK
615+
lmvskZero = LMVSK 0 0 0 0 0
616+
617+
-- | Performs the heavy lifting of fastLMVSK. This is exposed
618+
-- because the internal `LMVSKState` is monoidal, allowing you
619+
-- to run these statistics in parallel over datasets which are
620+
-- split and then combine the results.
621+
{-# INLINE foldLMVSKState #-}
622+
foldLMVSKState :: F.Fold Double LMVSKState
623+
foldLMVSKState = F.Fold stepLMVSKState (LMVSKState lmvskZero) id
624+
625+
{-# INLINE stepLMVSKState #-}
626+
stepLMVSKState :: LMVSKState -> Double -> LMVSKState
627+
stepLMVSKState (LMVSKState (LMVSK n1 m1 m2 m3 m4)) x = LMVSKState $ LMVSK n m1' m2' m3' m4' where
628+
fi :: Int -> Double
629+
fi = fromIntegral
630+
-- long long n1 = n;
631+
-- n++;
632+
n = n1+1
633+
-- delta = x - M1;
634+
delta = x - m1
635+
-- delta_n = delta / n;
636+
delta_n = delta / fi n
637+
-- delta_n2 = delta_n * delta_n;
638+
delta_n2 = delta_n * delta_n
639+
-- term1 = delta * delta_n * n1;
640+
term1 = delta * delta_n * fi n1
641+
-- M1 += delta_n;
642+
m1' = m1 + delta_n
643+
-- M4 += term1 * delta_n2 * (n*n - 3*n + 3) + 6 * delta_n2 * M2 - 4 * delta_n * M3;
644+
m4' = m4 + term1 * delta_n2 * fi (n*n - 3*n + 3) + 6 * delta_n2 * m2 - 4 * delta_n * m3
645+
-- M3 += term1 * delta_n * (n - 2) - 3 * delta_n * M2;
646+
m3' = m3 + term1 * delta_n * fi (n - 2) - 3 * delta_n * m2
647+
-- M2 += term1;
648+
m2' = m2 + term1
649+
650+
-- | Returns the stats which have been computed in a LMVSKState with /population/ variance.
651+
getLMVSK :: LMVSKState -> LMVSK
652+
getLMVSK (LMVSKState (LMVSK n m1 m2 m3 m4)) = LMVSK n m1 m2' m3' m4' where
653+
nd = fromIntegral n
654+
-- M2/(n-1.0)
655+
m2' = m2 / nd
656+
-- sqrt(double(n)) * M3/ pow(M2, 1.5)
657+
m3' = sqrt nd * m3 / (m2 ** 1.5)
658+
-- double(n)*M4 / (M2*M2) - 3.0
659+
m4' = nd*m4 / (m2*m2) - 3.0
660+
661+
-- | Returns the stats which have been computed in a LMVSKState,
662+
-- with the /sample/ variance.
663+
getLMVSKu :: LMVSKState -> LMVSK
664+
getLMVSKu (LMVSKState (LMVSK n m1 m2 m3 m4)) = LMVSK n m1 m2' m3' m4' where
665+
nd = fromIntegral n
666+
-- M2/(n-1.0)
667+
m2' = m2 / (nd-1)
668+
-- sqrt(double(n)) * M3/ pow(M2, 1.5)
669+
m3' = sqrt nd * m3 / (m2 ** 1.5)
670+
-- double(n)*M4 / (M2*M2) - 3.0
671+
m4' = nd*m4 / (m2*m2) - 3.0
672+
673+
525674
--------
526675

527676
-- $references
@@ -542,3 +691,6 @@ correlation (muX, muY) =
542691
-- * West, D.H.D. (1979) Updating mean and variance estimates: an
543692
-- improved method. /Communications of the ACM/
544693
-- 22(9):532&#8211;535. <http://doi.acm.org/10.1145/359146.359153>
694+
--
695+
-- * John D. Cook. Computing skewness and kurtosis in one pass
696+
-- <http://www.johndcook.com/blog/skewness_kurtosis/>

0 commit comments

Comments
 (0)