"""unicode.py Playing around with strings, bytes, utf8, unicode, and all that in python 3. Strings (type ) are sequences of characters (usually unicode), not sequences of bytes. The ord() function returns the unicode "code point" (i.e. unicode numeric value) of the character. They are input with single or double quotes : 'abc' or "έψιλον". They can be read in from a file with open(filename,'r').read(). Strings have a .encode() method which turns them into a sequence of bytes. The default encoding is UTF-8, in which unusual characters get stored as multiple bytes, but common ASCII characters take one byte. Bytes (type ) are sequences of 8-bit bytes, not unicode characters. They can be typed as b'abc' or (for the greek characters above) b'\xce\xad\xcf\x88\xce\xb9\xce\xbb\xce\xbf\xce\xbd' where something like \xce means is hex byte 0xCE which is 206 in base 10. And they can be read from a file which has been opened in binary mode, e.g. open(filename,'rb').read(). Bytes have a .decode() method which turns them into character strings. (Note that in python 2.*, strings were just byte arrays - there were no unicode sequences or a bytes type.) This code gives some examples. Running it looks like this. $ python --version Python 3.7.3 $ python unicode.py -- string -- faces = ' 🤨 😟 ' type(faces) = len(faces) = 5 faces.encode() = b' \xf0\x9f\xa4\xa8 \xf0\x9f\x98\x9f ' -- s = open(,'r').read() -- type(s) = len(s) = 3932 repr( s[2030:2050] ) = 'es = " 🤨 😟 " #' -- bytes -- face_bytes = b' \xf0\x9f\xa4\xa8 \xf0\x9f\x98\x9f ' type(face_bytes) = len(face_bytes) = 11 face_bytes.decode() = ' 🤨 😟 ' -- b = open(,'rb').read() -- type(b) = len(b) = 4022 repr( b[1900:1930] ) = b"' + epsilon = ' epsilon \xce\xad\xcf\x88\xce\xb9" -- face with one raised eyebrow U+1F928 -- ord('🤨') = 129320 = 0x1f928 = unicode code point '🤨'.encode() = b'\xf0\x9f\xa4\xa8' -- non-ascii symbols in variable names -- έψιλον + ' ' + epsilon = ' epsilon έψιλον' (I used the repr() representation function above to avoid printing newlines.) *Now* we're having fun. Jim Mahoney | cs.marlboro.college | MIT License | Oct 2019 """ print() print(" -- string -- ") faces = " 🤨 😟 " # five characters: space, face, space, face, space print(" faces = '{}'".format(faces)) print(" type(faces) = ", type(faces)) print(" len(faces) = ", len(faces)) print(" faces.encode() = ", faces.encode()) print() print(" -- s = open(,'r').read() --") s = open('unicode.py', 'r').read() print(" type(s) = ", type(s)) print(" len(s) = ", len(s)) print(" repr( s[2030:2050] ) = {:s}".format(repr(s[2030:2050]))) print() print(" -- bytes -- ") face_bytes = " 🤨 😟 ".encode() print(" face_bytes = {}".format(face_bytes)) print(" type(face_bytes) = ", type(face_bytes)) print(" len(face_bytes) = ", len(face_bytes)) # lots more than five bytes print(" face_bytes.decode() = '{}'".format(face_bytes.decode())) print() print(" -- b = open(,'rb').read() --") b = open('unicode.py', 'rb').read() print(" type(b) = ", type(b)) print(" len(b) = ", len(b)) print(" repr( b[1900:1930] ) = {}".format(repr(b[1900:1930]))) print() # I like the "face with one eyebrow raised" unicode character, # U+1F928, also known as the "Colbert emoji". # See www.fileformat.info/info/unicode/char/1f928/index.htm # and unicode.org/emoji/charts/full-emoji-list.html print(" -- face with one raised eyebrow U+1F928 --") print(" ord('🤨') = {} = {} = unicode code point".format( ord('🤨'), hex(ord('🤨')))) print(" '🤨'.encode() = ", '🤨'.encode()) print() # You can put "symbol like" characters into variable names, # but many unicode symbols are not allowed in varaibles. # See https://docs.python.org/3.3/reference/lexical_analysis.html#identifiers # # For example this doesn't work, # giving "SyntaxError: invalid character in identifier" # __🤨__ = 3 # # But you can do the following, which looks Greek to me ... print(" -- non-ascii symbols in variable names --") έψιλον = "epsilon" epsilon = "έψιλον" print(" έψιλον + ' ' + epsilon = '", έψιλον + ' ' + epsilon + "'") print()