<?php

namespace MathPHP\Statistics;

/**
 * Frequency distributions, rankings, and stem-and-leaf plots for a sample of values.
 *
 * Every method is static and takes the sample as a plain PHP array.
 * Every method returns an empty array when given an empty sample.
 */
class Distribution
{
    /** Convenience flag to pass as the $print argument of stemAndLeafPlot(). */
    public const PRINT = true;

    /**
     * Frequency distribution
     * A table that displays the frequency of various outcomes in a sample.
     * Each entry in the table contains the frequency or count of the occurrences of values
     * within a particular group or interval.
     * The table summarizes the distribution of values in the sample.
     * https://en.wikipedia.org/wiki/Frequency_distribution
     *
     * The values of the input array will be the keys of the result array.
     * The count of the values will be the value of the result array for that key.
     * Keys appear in the order in which each value is first seen in the input.
     *
     * Note: values are used directly as array keys, so PHP's array-key casting applies
     * (numeric strings become ints, floats are truncated, booleans become 0 or 1).
     *
     * @param array<scalar> $values Ex: ( A, A, A, B, B, C )
     *
     * @return array<scalar, int> frequency distribution Ex: ( A => 3, B => 2, C => 1 )
     */
    public static function frequency(array $values): array
    {
        $frequencies = [];
        foreach ($values as $value) {
            $frequencies[$value] = ($frequencies[$value] ?? 0) + 1;
        }

        return $frequencies;
    }

    /**
     * Relative frequency distribution
     * Frequency distribution relative to the sample size.
     *
     * Relative Frequency = Frequency / Sample Size
     *
     * The values of the input array will be the keys of the result array.
     * The relative frequency of the values will be the value of the result array for that key.
     *
     * @param array<scalar> $values Ex: ( A, A, A, A, A, A, B, B, B, C )
     *
     * @return array<scalar, float> relative frequency distribution Ex: ( A => 0.6, B => 0.3, C => 0.1 )
     */
    public static function relativeFrequency(array $values): array
    {
        $sampleSize = \count($values);

        // An empty sample yields no frequencies, so the division below never runs with a zero divisor.
        $relativeFrequencies = [];
        foreach (self::frequency($values) as $subject => $frequency) {
            $relativeFrequencies[$subject] = $frequency / $sampleSize;
        }

        return $relativeFrequencies;
    }

    /**
     * Cumulative frequency distribution
     *
     * The values of the input array will be the keys of the result array.
     * The cumulative frequency of the values will be the value of the result array for that key.
     * Totals accumulate in the order each value is first seen in the input; the input is not sorted.
     *
     * @param array<scalar> $values Ex: ( A, A, A, A, A, A, B, B, B, C )
     *
     * @return array<scalar, int> cumulative frequency distribution Ex: ( A => 6, B => 9, C => 10 )
     */
    public static function cumulativeFrequency(array $values): array
    {
        $runningTotal          = 0;
        $cumulativeFrequencies = [];
        foreach (self::frequency($values) as $value => $frequency) {
            $runningTotal                  += $frequency;
            $cumulativeFrequencies[$value]  = $runningTotal;
        }

        return $cumulativeFrequencies;
    }

    /**
     * Cumulative relative frequency distribution
     * Cumulative frequency distribution relative to the sample size.
     *
     * Cumulative relative frequency = cumulative frequency / sample size
     *
     * The values of the input array will be the keys of the result array.
     * The cumulative relative frequency of the values will be the value of the result array for that key.
     *
     * @param array<scalar> $values Ex: ( A, A, A, A, A, A, B, B, B, C )
     *
     * @return array<scalar, float> cumulative relative frequency distribution Ex: ( A => 0.6, B => 0.9, C => 1 )
     */
    public static function cumulativeRelativeFrequency(array $values): array
    {
        $sampleSize = \count($values);

        // array_map() with a single array preserves keys, so the result stays keyed by value.
        return \array_map(
            static function ($cumulativeFrequency) use ($sampleSize) {
                return $cumulativeFrequency / $sampleSize;
            },
            self::cumulativeFrequency($values)
        );
    }

    /**
     * Assign a fractional average ranking to data - ("1 2.5 2.5 4" ranking)
     * https://en.wikipedia.org/wiki/Ranking
     *
     * Similar to R: rank(values, ties.method='average')
     *
     * Values are grouped by their string representation (strval), and every member
     * of a group receives the mean of the ordinal ranks the group occupies.
     *
     * @param array<scalar> $values to be ranked
     *
     * @return array<float> Rankings of the data in the same order the values were input
     */
    public static function fractionalRanking(array $values): array
    {
        $sorted = $values;
        \sort($sorted);

        // Collect every ordinal rank (1-based position in sorted order) that each distinct value occupies.
        $ordinalRanks = [];
        foreach ($sorted as $position => $x) {
            $ordinalRanks[\strval($x)][] = $position + 1;
        }

        // Average the ranks of tied values; a value that occurs once keeps its single rank.
        $averageRanks = \array_map(
            static function (array $ranks) {
                return \array_sum($ranks) / \count($ranks);
            },
            $ordinalRanks
        );

        // Map ranks to values in the order they were originally input.
        return \array_map(
            static function ($value) use ($averageRanks) {
                return $averageRanks[\strval($value)];
            },
            $values
        );
    }

    /**
     * Assign a standard competitive ranking to data - ("1224" ranking)
     * https://en.wikipedia.org/wiki/Ranking
     *
     * Similar to R: rank(values, ties.method='min')
     *
     * Tied values all receive the lowest rank of their group.
     *
     * @param array<scalar> $values to be ranked
     *
     * @return array<int> Rankings of the data in the same order the values were input
     */
    public static function standardCompetitionRanking(array $values): array
    {
        $count = \count($values);
        if ($count === 0) {
            // Nothing to rank. Without this guard array_combine() below would be handed
            // arrays of different sizes (no keys, one seeded rank) and fail.
            return [];
        }

        $sorted = $values;
        \sort($sorted);

        // Walk the sorted values forwards: a value equal to its predecessor shares
        // that predecessor's rank, otherwise it takes its own 1-based position.
        $ranks = [1];
        for ($i = 1; $i < $count; $i++) {
            $ranks[$i] = $sorted[$i] == $sorted[$i - 1]
                ? $ranks[$i - 1]
                : $i + 1;
        }

        // Key the ranks by value. Duplicate keys overwrite each other with the same rank.
        /** @var array<int|string, int<1, max>> $rankOf */
        $rankOf = \array_combine(\array_map('\strval', $sorted), $ranks);

        // Map ranks to values in the order they were originally input.
        return \array_map(
            static function ($value) use ($rankOf) {
                return $rankOf[\strval($value)];
            },
            $values
        );
    }

    /**
     * Assign a modified competitive ranking to data - ("1334" ranking)
     * https://en.wikipedia.org/wiki/Ranking
     *
     * Similar to R: rank(values, ties.method='max')
     *
     * Tied values all receive the highest rank of their group.
     *
     * @param array<scalar> $values to be ranked
     *
     * @return array<int> Rankings of the data in the same order the values were input
     */
    public static function modifiedCompetitionRanking(array $values): array
    {
        $count = \count($values);
        if ($count === 0) {
            // Nothing to rank. Without this guard array_combine() below would be handed
            // arrays of different sizes (no keys, one seeded rank) and fail.
            return [];
        }

        $sorted = $values;
        \sort($sorted);

        // Pre-allocate in index order so the ranks line up with $sorted without re-sorting.
        // The last (largest) element always has rank $count.
        $ranks = \array_fill(0, $count, $count);

        // Walk the sorted values backwards: a value equal to its successor shares
        // that successor's rank, otherwise it takes its own 1-based position.
        for ($i = $count - 2; $i >= 0; $i--) {
            $ranks[$i] = $sorted[$i] == $sorted[$i + 1]
                ? $ranks[$i + 1]
                : $i + 1;
        }

        // Key the ranks by value. Duplicate keys overwrite each other with the same rank.
        /** @var array<int|string, int<1, max>> $rankOf */
        $rankOf = \array_combine(\array_map('\strval', $sorted), $ranks);

        // Map ranks to values in the order they were originally input.
        return \array_map(
            static function ($value) use ($rankOf) {
                return $rankOf[\strval($value)];
            },
            $values
        );
    }

    /**
     * Assign an ordinal ranking to data - ("1234" ranking)
     * https://en.wikipedia.org/wiki/Ranking
     *
     * Similar to R: rank(values, ties.method='first')
     *
     * Every item gets a distinct rank. Among tied values, the one that appears
     * earlier in the input receives the lower rank.
     *
     * @param array<scalar> $values to be ranked
     *
     * @return array<int> Rankings of the data in the same order the values were input
     */
    public static function ordinalRanking(array $values): array
    {
        $sorted = $values;
        \sort($sorted);

        // Queue up, per distinct value, the ascending ranks that value occupies.
        $pendingRanks = [];
        foreach ($sorted as $position => $x) {
            $pendingRanks[\strval($x)][] = $position + 1;
        }

        // Map ranks to values in the order they were originally input,
        // handing out the lowest unused rank of each value first.
        $rankedValues = [];
        foreach ($values as $value) {
            $rankedValues[] = \array_shift($pendingRanks[\strval($value)]);
        }

        return $rankedValues;
    }

    /**
     * Stem and leaf plot
     * Device for presenting quantitative data in a graphical format, similar to a histogram,
     * to assist in visualizing the shape of a distribution.
     * https://en.wikipedia.org/wiki/Stem-and-leaf_display
     *
     * Returns an array with the keys as the stems, and the values are arrays containing the leaves.
     * The stem is the value integer-divided by 10 and the leaf is the value modulo 10.
     * Stems between the smallest and largest stem that have no leaves are included as empty arrays.
     *
     * Optional parameter to print the stem and leaf plot.
     * Given input array: [ 44 46 47 49 63 64 66 68 68 72 72 75 76 81 84 88 106 ]
     * Prints:
     *   4 | 4 6 7 9
     *   5 |
     *   6 | 3 4 6 8 8
     *   7 | 2 2 5 6
     *   8 | 1 4 8
     *   9 |
     *  10 | 6
     *
     * @param array<int> $values
     * @param bool       $print  Optional setting to print the distribution
     *
     * @return array<int, array<int>> keys are the stems, values are the leaves
     */
    public static function stemAndLeafPlot(array $values, bool $print = false): array
    {
        if (\count($values) === 0) {
            // No stems exist, so there is nothing to plot or print.
            // min()/max() below cannot operate on an empty list.
            return [];
        }

        // Split each value into stem and leaf; sorting first keeps each stem's leaves ascending.
        \sort($values);
        $plot = [];
        foreach ($values as $value) {
            $stem          = \intdiv($value, 10);
            $leaf          = $value % 10;
            $plot[$stem][] = $leaf;
        }

        // Fill in any empty keys in the distribution we had no stem/leaves for.
        $stems = \array_keys($plot);
        $min   = \min($stems);
        $max   = \max($stems);
        for ($stem = $min; $stem <= $max; $stem++) {
            if (!isset($plot[$stem])) {
                $plot[$stem] = [];
            }
        }
        \ksort($plot);

        // Optionally print the stem and leaf plot, right-aligning stems to the widest one.
        if ($print) {
            $length = \max(\array_map(
                static function ($stem) {
                    return \strlen((string) $stem);
                },
                \array_keys($plot)
            ));
            foreach ($plot as $stem => $leaves) {
                \printf("%{$length}d | %s\n", $stem, \implode(' ', $leaves));
            }
        }

        return $plot;
    }
}