Distribution.php 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306
  1. <?php
  2. namespace MathPHP\Statistics;
  3. class Distribution
  4. {
  5. public const PRINT = true;
  6. /**
  7. * Frequency distribution
  8. * A table that displays the frequency of various outcomes in a sample.
  9. * Each entry in the table contains the frequency or count of the occurrences of values
  10. * within a particular group or interval.
  11. * The table summarizes the distribution of values in the sample.
  12. * https://en.wikipedia.org/wiki/Frequency_distribution
  13. *
  14. * The values of the input array will be the keys of the result array.
  15. * The count of the values will be the value of the result array for that key.
  16. *
  17. * @param array<scalar> $values Ex: ( A, A, A, B, B, C )
  18. *
  19. * @return array<scalar, int> frequency distribution Ex: ( A => 3, B => 2, C => 1 )
  20. */
  21. public static function frequency(array $values): array
  22. {
  23. $frequencies = array();
  24. foreach ($values as $value) {
  25. if (!isset($frequencies[$value])) {
  26. $frequencies[$value] = 1;
  27. } else {
  28. $frequencies[$value]++;
  29. }
  30. }
  31. return $frequencies;
  32. }
  33. /**
  34. * Relative frequency distribution
  35. * Frequency distribution relative to the sample size.
  36. *
  37. * Relative Frequency = Frequency / Sample Size
  38. *
  39. * The values of the input array will be the keys of the result array.
  40. * The relative frequency of the values will be the value of the result array for that key.
  41. *
  42. * @param array<scalar> $values Ex: ( A, A, A, A, A, A, B, B, B, C )
  43. *
  44. * @return array<scalar, float> relative frequency distribution Ex: ( A => 0.6, B => 0.3, C => 0.1 )
  45. */
  46. public static function relativeFrequency(array $values): array
  47. {
  48. $sample_size = \count($values);
  49. $relative_frequencies = array();
  50. foreach (self::frequency($values) as $subject => $frequency) {
  51. $relative_frequencies[$subject] = $frequency / $sample_size;
  52. }
  53. return $relative_frequencies;
  54. }
  55. /**
  56. * Cumulative frequency distribution
  57. *
  58. * The values of the input array will be the keys of the result array.
  59. * The cumulative frequency of the values will be the value of the result array for that key.
  60. *
  61. * @param array<scalar> $values Ex: ( A, A, A, A, A, A, B, B, B, C )
  62. *
  63. * @return array<scalar, int> cumulative frequency distribution Ex: ( A => 6, B => 9, C => 10 )
  64. */
  65. public static function cumulativeFrequency(array $values): array
  66. {
  67. $running_total = 0;
  68. $cumulative_frequencies = array();
  69. foreach (self::frequency($values) as $value => $frequency) {
  70. $running_total += $frequency;
  71. $cumulative_frequencies[$value] = $running_total;
  72. }
  73. return $cumulative_frequencies;
  74. }
  75. /**
  76. * Cumulative relative frequency distribution
  77. * Cumulative frequency distribution relative to the sample size.
  78. *
  79. * Cumulative relative frequency = cumulative frequency / sample size
  80. *
  81. * The values of the input array will be the keys of the result array.
  82. * The cumulative frequency of the values will be the value of the result array for that key.
  83. *
  84. * @param array<scalar> $values Ex: ( A, A, A, A, A, A, B, B, B, C )
  85. *
  86. * @return array<scalar, float> cumulative relative frequency distribution Ex: ( A => 0.6, B => 0.9, C => 1 )
  87. */
  88. public static function cumulativeRelativeFrequency(array $values): array
  89. {
  90. $sample_size = \count($values);
  91. $cumulative_frequencies = self::cumulativeFrequency($values);
  92. return \array_map(
  93. function ($frequency) use ($sample_size) {
  94. return $frequency / $sample_size;
  95. },
  96. $cumulative_frequencies
  97. );
  98. }
  99. /**
  100. * Assign a fractional average ranking to data - ("1 2.5 2.5 4" ranking)
  101. * https://en.wikipedia.org/wiki/Ranking
  102. *
  103. * Similar to R: rank(values, ties.method='average')
  104. *
  105. * @param array<scalar> $values to be ranked
  106. *
  107. * @return array<float> Rankings of the data in the same order the values were input
  108. */
  109. public static function fractionalRanking(array $values): array
  110. {
  111. $Xs = $values;
  112. \sort($Xs);
  113. // Determine ranks - some items might show up multiple times, so record each successive rank.
  114. $ordinalRanking⟮X⟯ = [];
  115. foreach ($Xs as $rank => $xᵢ) {
  116. $ordinalRanking⟮X⟯[\strval($xᵢ)][] = $rank + 1;
  117. }
  118. // Determine average rank of each value. Necessary when values show up multiple times.
  119. // Rank will not change if value only shows up once.
  120. $rg⟮X⟯ = \array_map(
  121. function (array $x) {
  122. return \array_sum($x) / \count($x);
  123. },
  124. $ordinalRanking⟮X⟯
  125. );
  126. // Map ranks to values in order they were originally input
  127. return \array_map(
  128. function ($value) use ($rg⟮X⟯) {
  129. return $rg⟮X⟯[\strval($value)];
  130. },
  131. $values
  132. );
  133. }
  134. /**
  135. * Assign a standard competitive ranking to data - ("1224" ranking)
  136. * https://en.wikipedia.org/wiki/Ranking
  137. *
  138. * Similar to R: rank(values, ties.method='min')
  139. *
  140. * @param array<scalar> $values to be ranked
  141. *
  142. * @return array<int> Rankings of the data in the same order the values were input
  143. */
  144. public static function standardCompetitionRanking(array $values): array
  145. {
  146. $count = \count($values);
  147. $Xs = $values;
  148. \sort($Xs);
  149. $ranking⟮X⟯ = [];
  150. $ranking⟮X⟯[0] = 1;
  151. for ($i = 1; $i < $count; $i++) {
  152. $ranking⟮X⟯[$i] = $Xs[$i] == $Xs[$i - 1]
  153. ? $ranking⟮X⟯[$i - 1]
  154. : $i + 1;
  155. }
  156. /** @var array<string, int<1, max>> $ranking⟮X⟯ */
  157. $ranking⟮X⟯ = \array_combine(\array_map('\strval', $Xs), $ranking⟮X⟯);
  158. // Map ranks to values in order they were originally input
  159. return \array_map(
  160. function ($value) use ($ranking⟮X⟯) {
  161. return $ranking⟮X⟯[\strval($value)];
  162. },
  163. $values
  164. );
  165. }
  166. /**
  167. * Assign a modified competitive ranking to data - ("1334" ranking)
  168. * https://en.wikipedia.org/wiki/Ranking
  169. *
  170. * Similar to R: rank(values, ties.method='max')
  171. *
  172. * @param array<scalar> $values to be ranked
  173. *
  174. * @return array<int> Rankings of the data in the same order the values were input
  175. */
  176. public static function modifiedCompetitionRanking(array $values): array
  177. {
  178. $count = \count($values);
  179. $Xs = $values;
  180. \sort($Xs);
  181. $ranking⟮X⟯ = [];
  182. $ranking⟮X⟯[$count - 1] = $count;
  183. for ($i = $count - 2; $i >= 0; $i--) {
  184. $ranking⟮X⟯[$i] = $Xs[$i] == $Xs[$i + 1]
  185. ? $ranking⟮X⟯[$i + 1]
  186. : $i + 1;
  187. }
  188. \sort($ranking⟮X⟯);
  189. /** @var array<string, int<0, max>> $ranking⟮X⟯ */
  190. $ranking⟮X⟯ = \array_combine(\array_map('\strval', $Xs), $ranking⟮X⟯);
  191. // Map ranks to values in order they were originally input
  192. return \array_map(
  193. function ($value) use ($ranking⟮X⟯) {
  194. return $ranking⟮X⟯[\strval($value)];
  195. },
  196. $values
  197. );
  198. }
  199. /**
  200. * Assign an ordinal ranking to data - ("1234" ranking)
  201. * https://en.wikipedia.org/wiki/Ranking
  202. *
  203. * Similar to R: rank(values, ties.method='first')
  204. *
  205. * @param array<scalar> $values to be ranked
  206. *
  207. * @return array<int> Rankings of the data in the same order the values were input
  208. */
  209. public static function ordinalRanking(array $values): array
  210. {
  211. $Xs = $values;
  212. \sort($Xs);
  213. $ranking⟮X⟯ = [];
  214. foreach ($Xs as $i => $x) {
  215. $ranking⟮X⟯[\strval($x)][] = $i + 1;
  216. }
  217. // Map ranks to values in order they were originally input
  218. $rankedValues = [];
  219. foreach ($values as $value) {
  220. $rankedValues[] = \array_shift($ranking⟮X⟯[\strval($value)]);
  221. }
  222. return $rankedValues;
  223. }
  224. /**
  225. * Stem and leaf plot
  226. * Device for presenting quantitative data in a graphical format, similar to a histogram,
  227. * to assist in visualizing the shape of a distribution.
  228. * https://en.wikipedia.org/wiki/Stem-and-leaf_display
  229. *
  230. * Returns an array with the keys as the stems, and the values are arrays containing the leaves.
  231. *
  232. * Optional parameter to print the stem and leaf plot.
  233. * Given input array: [ 44 46 47 49 63 64 66 68 68 72 72 75 76 81 84 88 106 ]
  234. * Prints:
  235. * 4 | 4 6 7 9
  236. * 5 |
  237. * 6 | 3 4 6 8 8
  238. * 7 | 2 2 5 6
  239. * 8 | 1 4 8
  240. * 9 |
  241. * 10 | 6
  242. *
  243. * @param array<int> $values
  244. * @param bool $print Optional setting to print the distribution
  245. *
  246. * @return array<int, array<int>> keys are the stems, values are the leaves
  247. */
  248. public static function stemAndLeafPlot(array $values, bool $print = false): array
  249. {
  250. // Split each value into stem and leaf
  251. \sort($values);
  252. $plot = array();
  253. foreach ($values as $value) {
  254. $stem = intdiv($value, 10);
  255. $leaf = $value % 10;
  256. if (!isset($plot[$stem])) {
  257. $plot[$stem] = array();
  258. }
  259. $plot[$stem][] = $leaf;
  260. }
  261. // Fill in any empty keys in the distribution we had no stem/leaves for
  262. $min = \min(\array_keys($plot));
  263. $max = \max(\array_keys($plot));
  264. for ($stem = $min; $stem <= $max; $stem++) {
  265. if (!isset($plot[$stem])) {
  266. $plot[$stem] = array();
  267. }
  268. }
  269. \ksort($plot);
  270. // Optionally print the stem and leaf plot
  271. if ($print === true) {
  272. $length = \max(\array_map(function ($stem) {
  273. return \strlen((string)$stem);
  274. }, \array_keys($plot)));
  275. foreach ($plot as $stem => $leaves) {
  276. \printf("%{$length}d | %s\n", $stem, \implode(' ', $leaves));
  277. }
  278. }
  279. return $plot;
  280. }
  281. }