<?php

namespace MathPHP\Statistics;

use MathPHP\Exception;
/**
 * Functions dealing with statistical divergence.
 * Related to probability, information theory, and entropy.
 *
 * - Divergences
 *   - Kullback-Leibler
 *   - Jensen-Shannon
 *
 * In statistics and information geometry, a divergence or contrast function is a function which establishes the
 * "distance" of one probability distribution from another on a statistical manifold. Divergence is a weaker notion
 * than distance: in particular, a divergence need not be symmetric (that is, in general the divergence from p to q
 * is not equal to the divergence from q to p), and it need not satisfy the triangle inequality.
 *
 * https://en.wikipedia.org/wiki/Divergence_(statistics)
 */
class Divergence
{
    private const ONE_TOLERANCE = 0.010001;

    /**
     * Kullback-Leibler divergence
     * (also known as: discrimination information, information divergence, information gain, relative entropy, KLIC, KL divergence)
     * A measure of the difference between two probability distributions P and Q.
     * https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
     *
     *                            P(i)
     *     Dkl(P‖Q) = ∑ P(i) log ------
     *                ⁱ           Q(i)
     *
     * @param array<int|float> $p distribution p
     * @param array<int|float> $q distribution q
     *
     * @return float difference between distributions
     *
     * @throws Exception\BadDataException if p and q do not have the same number of elements
     * @throws Exception\BadDataException if p and q are not probability distributions that add up to 1
     */
    public static function kullbackLeibler(array $p, array $q): float
    {
        // Arrays must have the same number of elements
        if (\count($p) !== \count($q)) {
            throw new Exception\BadDataException('p and q must have the same number of elements');
        }

        // Probability distributions must add up to 1.0
        if ((\abs(\array_sum($p) - 1) > self::ONE_TOLERANCE) || (\abs(\array_sum($q) - 1) > self::ONE_TOLERANCE)) {
            throw new Exception\BadDataException('Distributions p and q must add up to 1');
        }

        // Defensive measures against taking the log of 0 which would be -∞ or dividing by 0
        $p = \array_map(
            function ($pᵢ) {
                return $pᵢ == 0 ? 1e-15 : $pᵢ;
            },
            $p
        );
        $q = \array_map(
            function ($qᵢ) {
                return $qᵢ == 0 ? 1e-15 : $qᵢ;
            },
            $q
        );

        // ∑ P(i) log(P(i)/Q(i))
        $Dkl⟮P‖Q⟯ = \array_sum(\array_map(
            function ($P, $Q) {
                return $P * \log($P / $Q);
            },
            $p,
            $q
        ));

        return $Dkl⟮P‖Q⟯;
    }
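
    /*
     * Example usage (illustrative sketch only; the distributions below are sample values, not library fixtures):
     *
     *   $p = [9/25, 12/25, 4/25];   // P = (0.36, 0.48, 0.16)
     *   $q = [1/3, 1/3, 1/3];       // Q = uniform distribution over three outcomes
     *
     *   Divergence::kullbackLeibler($p, $q);   // ≈ 0.0853 (natural log, i.e. nats)
     *   Divergence::kullbackLeibler($q, $p);   // ≈ 0.0975; a different value, since KL divergence is not symmetric
     */
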
    /**
     * Jensen-Shannon divergence
     * Also known as: information radius (IRad) or total divergence to the average.
     * A method of measuring the similarity between two probability distributions.
     * It is based on the Kullback–Leibler divergence, with some notable (and useful) differences,
     * including that it is symmetric and it is always a finite value.
     * https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence
     *
     *                1          1
     *     JSD(P‖Q) = - D(P‖M) + - D(Q‖M)
     *                2          2
     *
     *                  1
     *     where  M   = - (P + Q)
     *                  2
     *
     *            D(P‖Q) = Kullback-Leibler divergence
     *
     * @param array<int|float> $p distribution p
     * @param array<int|float> $q distribution q
     *
     * @return float difference between distributions
     *
     * @throws Exception\BadDataException if p and q do not have the same number of elements
     * @throws Exception\BadDataException if p and q are not probability distributions that add up to 1
     */
    public static function jensenShannon(array $p, array $q): float
    {
        // Arrays must have the same number of elements
        if (\count($p) !== \count($q)) {
            throw new Exception\BadDataException('p and q must have the same number of elements');
        }

        // Probability distributions must add up to 1.0
        if ((\abs(\array_sum($p) - 1) > self::ONE_TOLERANCE) || (\abs(\array_sum($q) - 1) > self::ONE_TOLERANCE)) {
            throw new Exception\BadDataException('Distributions p and q must add up to 1');
        }

        // Mixture distribution M = ½ (P + Q)
        $M = \array_map(
            function ($pᵢ, $qᵢ) {
                return ($pᵢ + $qᵢ) / 2;
            },
            $p,
            $q
        );

        $½D⟮P‖M⟯ = self::kullbackLeibler($p, $M) / 2;
        $½D⟮Q‖M⟯ = self::kullbackLeibler($q, $M) / 2;

        return $½D⟮P‖M⟯ + $½D⟮Q‖M⟯;
    }
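
    /*
     * Example usage (illustrative sketch only; same hypothetical sample distributions as the Kullback-Leibler example above):
     *
     *   $p = [9/25, 12/25, 4/25];   // P = (0.36, 0.48, 0.16)
     *   $q = [1/3, 1/3, 1/3];       // Q = uniform distribution over three outcomes
     *
     *   Divergence::jensenShannon($p, $q);   // ≈ 0.0225 (natural log, i.e. nats)
     *   Divergence::jensenShannon($q, $p);   // same value; unlike KL divergence, JSD is symmetric
     */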
}